diff --git a/inferrt/src/CMakeLists.txt b/inferrt/src/CMakeLists.txt
index 901648c4d6d214cf82f1270512295e81bf90a633..55da6881fe3687196572a72b6a8b83fb6e27a599 100644
--- a/inferrt/src/CMakeLists.txt
+++ b/inferrt/src/CMakeLists.txt
@@ -41,11 +41,12 @@ add_subdirectory(optimize)
 add_subdirectory(pybind)
 add_subdirectory(runtime)
 
-set(OBJECTS lexer_obj parser_obj ir_obj compiler_obj vm_obj tensor_obj runtime_obj ops_obj pass_obj)
+set(OBJECTS lexer_obj parser_obj ir_obj compiler_obj vm_obj tensor_obj runtime_obj ops_obj pass_obj hardware_abstract_obj hardware_ascend_obj)
+set(TEST_OBJECTS test_obj)
 
 # Create the da executable.
-add_executable(da lang/cli/main.cc lang/cli/options.cc)
-target_link_libraries(da ${OBJECTS} stdc++fs)
+# add_executable(da lang/cli/main.cc lang/cli/options.cc)
+# target_link_libraries(da ${OBJECTS} stdc++fs)
 
 # Create shared library.
 target_compile_options(inferrt PRIVATE -fPIC)
 add_library(inferrt SHARED lang/api/c_api.cc)
diff --git a/inferrt/src/common/common.h b/inferrt/src/common/common.h
index 78c58f9101ff7bcdf902433f619692be71459e43..16cbf7cf6fafebf5938621e1fccc72274573f049 100644
--- a/inferrt/src/common/common.h
+++ b/inferrt/src/common/common.h
@@ -23,7 +23,6 @@
 #include <iostream>
 #include <limits>
 #include <cstring>
-
 #include "common/logger.h"
 
 #define ENDL '\n'
@@ -48,6 +47,146 @@
   ; \
   ;
 
+#define DISABLE_COPY_AND_ASSIGN(ClassType)   \
+  ClassType(const ClassType &) = delete;     \
+  ClassType &operator=(const ClassType &) = delete;
+
+inline uint32_t LongToUint(int64_t u) {
+  if (u < 0) {
+    LOG_ERROR << "The int64_t value(" << u << ") is less than 0.";
+  }
+  if (u > static_cast<int64_t>((std::numeric_limits<uint32_t>::max)())) {
+    LOG_ERROR << "The int64_t value(" << u << ") exceeds the maximum value of uint32_t.";
+  }
+  return static_cast<uint32_t>(u);
+}
+
+inline size_t FloatToSize(float u) {
+  if (u < 0) {
+    LOG_ERROR << "The float value(" << u << ") is less than 0.";
+  }
+
+  if (u > static_cast<float>((std::numeric_limits<size_t>::max)())) {
+    LOG_ERROR << "The float value(" << u << ") exceeds the maximum value of size_t.";
+  }
+  return static_cast<size_t>(u);
+}
+
+inline float IntToFloat(int32_t v) { return static_cast<float>(v); }
+
+inline size_t LongToSize(int64_t u) {
+  if (u < 0) {
+    LOG_ERROR << "The int64_t value(" << u << ") is less than 0.";
+  }
+  return static_cast<size_t>(u);
+}
+
+inline int FloatToInt(float u) {
+  if (u > static_cast<float>((std::numeric_limits<int>::max)())) {
+    LOG_ERROR << "The float value(" << u << ") exceeds the maximum value of int.";
+  }
+  return static_cast<int>(u);
+}
+
+inline int64_t FloatToLong(float u) {
+  if (u > static_cast<float>((std::numeric_limits<int64_t>::max)())) {
+    LOG_ERROR << "The float value(" << u << ") exceeds the maximum value of int64_t.";
+  }
+  return static_cast<int64_t>(u);
+}
+
+inline int64_t DoubleToLong(double u) {
+  if (u > static_cast<double>((std::numeric_limits<int64_t>::max)())) {
+    LOG_ERROR << "The double value(" << u << ") exceeds the maximum value of int64_t.";
+  }
+  return static_cast<int64_t>(u);
+}
+
+inline float SizeToFloat(size_t v) { return static_cast<float>(v); }
+
+inline uint64_t SizeToUlong(size_t u) { return static_cast<uint64_t>(u); }
+
+inline int SizeToInt(size_t u) {
+  if (u > static_cast<size_t>((std::numeric_limits<int>::max)())) {
+    LOG_ERROR << "The size_t value(" << u << ") exceeds the maximum value of int.";
+  }
+  return static_cast<int>(u);
+}
+
+inline uint32_t SizeToUint(size_t u) {
+  if (u > static_cast<size_t>((std::numeric_limits<uint32_t>::max)())) {
+    LOG_ERROR << "The size_t value(" << u << ") exceeds the maximum value of uint32_t.";
+  }
+  return static_cast<uint32_t>(u);
+}
LOG_ERROR << "The size_t value(" << u << ") exceeds the maximum value of int64_t."; + } + return static_cast(u); +} + +inline double LongToDouble(int64_t v) { return static_cast(v); } + +inline float LongToFloat(int64_t v) { return static_cast(v); } + +inline double FloatToDouble(float v) { return static_cast(v); } + +inline uint32_t IntToUint(int32_t u) { + if (u < 0) { + LOG_ERROR << "The int32_t value(" << u << ") is less than 0."; + } + return static_cast(u); +} + +inline int32_t UintToInt(uint32_t u) { + if (u > static_cast((std::numeric_limits::max)())) { + LOG_ERROR << "The uint32_t value(" << u << ") exceeds the maximum value of int32_t."; + } + return static_cast(u); +} + +inline uint64_t LongToUlong(int64_t u) { + if (u < 0) { + LOG_ERROR << "The int64_t value(" << u << ") is less than 0."; + } + return static_cast(u); +} + +inline int32_t LongToInt(int64_t u) { + if (u > static_cast((std::numeric_limits::max)())) { + LOG_ERROR << "The size_t value(" << u << ") exceeds the maximum value of int."; + } + return static_cast(u); +} + +inline int64_t IntToLong(int32_t v) { return static_cast(v); } + +inline int64_t UlongToLong(uint64_t u) { + if (u > static_cast((std::numeric_limits::max)())) { + LOG_ERROR << "The uint64_t value(" << u << ") exceeds the maximum value of int64_t."; + } + return static_cast(u); +} + +inline unsigned int UlongToUint(uint64_t u) { + if (u > static_cast((std::numeric_limits::max)())) { + LOG_ERROR << "The size_t value(" << u << ") exceeds the maximum value of unsigned int."; + } + return static_cast(u); +} + +inline uint8_t *AddressOffset(void *address, size_t offset) { + CHECK_IF_NULL(address); + return static_cast(address) + offset; +} + +inline size_t CalAddressOffset(void *dst_address, void *ori_address) { + CHECK_IF_NULL(dst_address); + CHECK_IF_NULL(ori_address); + return static_cast(dst_address) - static_cast(ori_address); +} + static inline void CompileMessage(const std::string &filename, const int line, const int col, const std::string &msg) { std::cout << filename << ':' << line << ':' << (col + 1) << ": " << msg << '\n'; } @@ -73,6 +212,16 @@ static inline size_t SkipWhiteSpace(const char *str) { return pos; } +#ifndef MS_UNLIKELY +#ifdef _MSC_VER +#define MS_UNLIKELY(x) (x) +#define MS_LIKELY(x) (x) +#else +#define MS_LIKELY(x) __builtin_expect(!!(x), 1) +#define MS_UNLIKELY(x) __builtin_expect(!!(x), 0) +#endif +#endif + template int FindNameIndex(const char *str, T *table, size_t tableSize) { const auto strLen = strlen(str); diff --git a/inferrt/src/hardware/CMakeLists.txt b/inferrt/src/hardware/CMakeLists.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..684141638e951733347e4c69a168c2553358c012 100644 --- a/inferrt/src/hardware/CMakeLists.txt +++ b/inferrt/src/hardware/CMakeLists.txt @@ -0,0 +1,5 @@ + +add_subdirectory(hardware_abstract) +add_subdirectory(ascend) +add_dependencies(hardware_ascend_obj hardware_abstract_obj) +add_subdirectory(tests) diff --git a/inferrt/src/hardware/ascend/CMakeLists.txt b/inferrt/src/hardware/ascend/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2d98dc376af0179525a2ec27a380f060450445e --- /dev/null +++ b/inferrt/src/hardware/ascend/CMakeLists.txt @@ -0,0 +1,21 @@ +check_debug_log_out() + +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_PATH /usr/local/Ascend) +endif() + +message("Note compile ascend path: ${ASCEND_PATH}") +include_directories(${ASCEND_PATH}/latest/include/) + +find_package(Python3 COMPONENTS 
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+include_directories(${Python3_INCLUDE_DIRS})
+
+set(depname "pybind11")
+set(PYBIND11_PATH "${PROJECT_SOURCE_DIR}/${depname}-src")
+include_directories(${PYBIND11_PATH}/include)
+
+file(GLOB_RECURSE HARDWARE_ASCEND_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
+add_library(hardware_ascend_obj STATIC ${HARDWARE_ASCEND_SRC_FILES})
\ No newline at end of file
diff --git a/inferrt/src/hardware/ascend/ascend_device_context.cc b/inferrt/src/hardware/ascend/ascend_device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e50a7d4d3d471d0939030a4840c9a8b5b31f1c5
--- /dev/null
+++ b/inferrt/src/hardware/ascend/ascend_device_context.cc
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/ascend/ascend_device_context.h"
+#include <unistd.h>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include "hardware/hardware_abstract/common.h"
+#include "hardware/hardware_abstract/device_context_manager.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_base_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.h"
+#include "common/common.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+namespace {
+constexpr auto kSaturationMode = "Saturation";
+constexpr auto kINFNANMode = "INFNAN";
+const char kAscendDevice[] = "Ascend";
+}  // namespace
+
+void AscendDeviceContext::InitializeForAclop() const {
+  if (initialized_aclop_) {
+    return;
+  }
+
+  LOG_OUT << "Start initializing for acl.";
+  LoadAscendApiSymbols();
+  // if (!UseSimulationApi()) {
+  //   auto ms_context = MsContext::GetInstance();
+  //   CHECK_IF_NULL(ms_context);
+  //   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  //   device::DeviceContextKey host_key = {device::GetDeviceNameByType(device::DeviceType::kAscend), device_id};
+  //   device::DeviceContext *host_context =
+  //     device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext(host_key);
+  //   CHECK_IF_NULL(host_context);
+  //   CHECK_IF_NULL(host_context->device_res_manager_);
+  //   auto ascend_res_manager = dynamic_cast<AscendResManager *>(host_context->device_res_manager_.get());
+  //   ascend_res_manager->InitializeForGe();
+  // }
+
+  // initialized_aclop_ = true;
+  LOG_OUT << "End initializing for acl.";
+}
+
+void AscendDeviceContext::Initialize() {
+  GilReleaseWithCheck gil_release;
+  std::lock_guard<std::mutex> lock(init_mutex_);
+  if (initialized_) {
+    return;
+  }
+
+  LOG_OUT << "Start initializing device context.";
+  LoadAscendApiSymbols();
+  // Set the overflow mode.
+  // auto ms_context = MsContext::GetInstance();
+  // CHECK_IF_NULL(ms_context);
+  // const auto &soc_version = ms_context->ascend_soc_version();
+  // if (soc_version == "ascend910b" || soc_version == "ascend910_93") {
+  //   bool is_sat = (common::GetEnv("MS_ASCEND_CHECK_OVERFLOW_MODE") == "SATURATION_MODE");
+  //   auto mode = (is_sat) ? aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION
+  //                        : aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_INFNAN;
+  //   auto overflow_mode = (is_sat) ? kSaturationMode : kINFNANMode;
+  //   MS_LOG(INFO) << "The current overflow detection mode is " << overflow_mode << ".";
+  //   auto ret = CALL_ASCEND_API(aclrtSetDeviceSatMode, mode);
+  //   if (ret != ACL_SUCCESS) {
+  //     MS_LOG(EXCEPTION) << "Set " << overflow_mode << " mode failed.";
+  //   }
+  // }
+
+  CHECK_IF_NULL(device_res_manager_);
+  device_res_manager_->Initialize();
+
+  // Set MS_CTX_ENABLE_GE_HETEROGENOUS true according to the heterogeneous mode.
+  // ms_context->set_param<bool>(MS_CTX_ENABLE_GE_HETEROGENOUS, false);
+
+  // if (ms_context->GetBackend() == kBackendGE) {
+  //   InitializeForAclop();
+  // }
+
+  initialized_ = true;
+  pid_ = getpid();  // Set the pid at first initialization.
+  LOG_OUT << "End initializing device context.";
+}
+
+void AscendDeviceContext::Destroy() {
+  if (pid_ != getpid()) {
+    // Check whether the device context needs to be released.
+    // The device context is copied into the independent dataset process by fork,
+    // but it does not need to be released in that process.
+    LOG_OUT << "The device context is not initialized by current process, it doesn't need to be destroyed.";
+    return;
+  }
+
+  if (device_res_manager_ == nullptr) {
+    return;
+  }
+  // The device resource manager must be destroyed before 'FinalizeGe'; otherwise some runtime APIs throw exceptions.
+  // For GE it has already been destroyed in graph_executor->finalize.
+  device_res_manager_->Destroy();
+  // device::ascend::AclnnFinalize();
+
+  initialized_ = false;
+}
+
+MS_REGISTER_DEVICE(kAscendDevice, AscendDeviceContext);
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/ascend_device_context.h b/inferrt/src/hardware/ascend/ascend_device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a2b3c6f976369134520079f6f164bf254f2daf8
--- /dev/null
+++ b/inferrt/src/hardware/ascend/ascend_device_context.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2022-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
+#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
+
+#include <sys/types.h>
+#include <memory>
+#include <string>
+#include "common/common.h"
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/memory_manager.h"
+#include "hardware/ascend/res_manager/ascend_res_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+class AscendResManager;
+
+class AscendDeviceContext : public DeviceInterface {
+ public:
+  explicit AscendDeviceContext(const DeviceContextKey &device_context_key) : DeviceInterface(device_context_key) {}
+  ~AscendDeviceContext() override = default;
+
+  void Initialize() override;
+
+  void InitializeForAclop() const;
+
+  void Destroy() override;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(AscendDeviceContext);
+
+  mutable bool initialized_aclop_{false};
+  pid_t pid_{0};  // Indicates the process id which creates the context.
+};
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_RUNTIME_HARDWARE_ASCEND_ASCEND_DEVICE_CONTEXT_H_
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_event.cc b/inferrt/src/hardware/ascend/res_manager/ascend_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29d645a16d6d72aa654ae87f66ca9bf759df975b
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_event.cc
@@ -0,0 +1,204 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/ascend/res_manager/ascend_event.h"
+#include <memory>
+#include <string>
+#include "hardware/ascend/res_manager/ascend_stream_manager.h"
+
+#include "common/common.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+
+namespace mindspore::device::ascend {
+AscendEvent::AscendEvent() {
+  auto ret = CALL_ASCEND_API(aclrtCreateEvent, &event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtCreateEvent failed, ret:" << ret;
+    event_ = nullptr;
+  }
+}
+
+AscendEvent::AscendEvent(uint32_t flag, bool use_extensional_api) {
+  aclError ret;
+  if (use_extensional_api) {
+    ret = CALL_ASCEND_API(aclrtCreateEventExWithFlag, &event_, flag);
+  } else {
+    ret = CALL_ASCEND_API(aclrtCreateEventWithFlag, &event_, flag);
+  }
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << (use_extensional_api ? "aclrtCreateEventExWithFlag" : "aclrtCreateEventWithFlag")
+              << " failed, ret:" << ret;
+    event_ = nullptr;
+  }
+  has_flag_ = true;
+  LOG_OUT << "Create ascend event success, flag : " << flag << ".";
+}
+
+AscendTimeEvent::AscendTimeEvent() {
+  auto ret = CALL_ASCEND_API(aclrtCreateEventWithFlag, &event_, ACL_EVENT_TIME_LINE);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtCreateEventWithFlag failed, ret:" << ret;
+    event_ = nullptr;
+  }
+}
+
+AscendEvent::~AscendEvent() {
+  if (!event_destroyed_) {
+    auto ret = CALL_ASCEND_API(aclrtDestroyEvent, event_);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "aclrtDestroyEvent failed, ret:" << ret;
+    }
+  }
+
+  event_ = nullptr;
+  wait_stream_ = nullptr;
+  record_stream_ = nullptr;
+}
+
+bool AscendEvent::IsReady() const { return event_ != nullptr; }
+
+void AscendEvent::RecordEvent() {
+  CHECK_IF_NULL(event_);
+  CHECK_IF_NULL(record_stream_);
+  auto ret = CALL_ASCEND_API(aclrtRecordEvent, event_, record_stream_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtRecordEvent failed, ret:" << ret;
+  }
+  need_wait_ = true;
+}
+
+void AscendEvent::RecordEvent(uint32_t stream_id) {
+  LOG_OUT << "Ascend record event on stream id : " << stream_id << ".";
+  CHECK_IF_NULL(event_);
+  record_stream_ = AscendStreamMng::GetInstance().GetStream(stream_id);
+  CHECK_IF_NULL(record_stream_);
+  auto ret = CALL_ASCEND_API(aclrtRecordEvent, event_, record_stream_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtRecordEvent failed, ret:" << ret;
+  }
+  need_wait_ = true;
+}
+
+void AscendEvent::WaitEvent() {
+  CHECK_IF_NULL(event_);
+  CHECK_IF_NULL(wait_stream_);
+  auto ret = CALL_ASCEND_API(aclrtStreamWaitEvent, wait_stream_, event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtStreamWaitEvent failed, ret:" << ret;
+  }
+  if (!has_flag_) {
+    // An event created by aclrtCreateEventExWithFlag does not support
+    // aclrtResetEvent/aclrtQueryEvent/aclrtQueryEventWaitStatus.
+    LOG_OUT << "Reset Event";
+    ret = CALL_ASCEND_API(aclrtResetEvent, event_, wait_stream_);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "aclrtResetEvent failed, ret:" << ret;
+    }
+  }
+  need_wait_ = false;
+}
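+// Overload that resolves the wait stream from a stream id before waiting; the event is
+// reset afterwards (when the event type supports it) so that it can be reused.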
+bool AscendEvent::WaitEvent(uint32_t stream_id) {
+  LOG_OUT << "Ascend wait event on stream id : " << stream_id << ".";
+  wait_stream_ = AscendStreamMng::GetInstance().GetStream(stream_id);
+  auto ret = CALL_ASCEND_API(aclrtStreamWaitEvent, wait_stream_, event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtStreamWaitEvent failed, ret:" << ret;
+  }
+  if (!has_flag_) {
+    // Reset the event after waiting so that the event can be reused.
+    ret = CALL_ASCEND_API(aclrtResetEvent, event_, wait_stream_);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "aclrtResetEvent failed, ret:" << ret;
+    }
+  }
+  need_wait_ = false;
+  return true;
+}
+
+void AscendEvent::WaitEventWithoutReset() {
+  CHECK_IF_NULL(event_);
+  CHECK_IF_NULL(wait_stream_);
+  // The query result will be reset after aclrtResetEvent is called.
+  auto ret = CALL_ASCEND_API(aclrtStreamWaitEvent, wait_stream_, event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtStreamWaitEvent failed, ret:" << ret;
+  }
+  need_wait_ = false;
+}
+
+void AscendEvent::WaitEventWithoutReset(uint32_t stream_id) {
+  wait_stream_ = AscendStreamMng::GetInstance().GetStream(stream_id);
+  WaitEventWithoutReset();
+}
+
+void AscendEvent::ResetEvent() {
+  CHECK_IF_NULL(event_);
+  CHECK_IF_NULL(wait_stream_);
+
+  LOG_OUT << "Reset Event";
+  auto ret = CALL_ASCEND_API(aclrtResetEvent, event_, wait_stream_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtResetEvent failed, ret:" << ret;
+  }
+}
+
+void AscendEvent::ResetEvent(uint32_t stream_id) {
+  wait_stream_ = AscendStreamMng::GetInstance().GetStream(stream_id);
+  ResetEvent();
+}
+
+void AscendEvent::SyncEvent() {
+  CHECK_IF_NULL(event_);
+  auto ret = CALL_ASCEND_API(aclrtSynchronizeEvent, event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtSynchronizeEvent failed, ret:" << ret;
+  }
+}
+
+bool AscendEvent::QueryEvent() {
+  CHECK_IF_NULL(event_);
+  aclrtEventRecordedStatus status;
+  auto ret = CALL_ASCEND_API(aclrtQueryEventStatus, event_, &status);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtQueryEventStatus failed, ret:" << ret;
+  }
+  return status == ACL_EVENT_RECORDED_STATUS_COMPLETE;
+}
+
+void AscendEvent::ElapsedTime(float *cost_time, const DeviceEvent *other) {
+  CHECK_IF_NULL(event_);
+  auto ascend_other = static_cast<const AscendEvent *>(other);
+  CHECK_IF_NULL(ascend_other);
+  CHECK_IF_NULL(ascend_other->event_);
+  auto ret = CALL_ASCEND_API(aclrtEventElapsedTime, cost_time, event_, ascend_other->event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtEventElapsedTime failed, ret:" << ret;
+  }
+}
+
+bool AscendEvent::NeedWait() { return need_wait_; }
+
+bool AscendEvent::DestroyEvent() {
+  CHECK_IF_NULL(event_);
+  auto ret = CALL_ASCEND_API(aclrtDestroyEvent, event_);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtDestroyEvent failed, ret:" << ret;
+  }
+  event_destroyed_ = true;
+  return true;
+}
+}  // namespace mindspore::device::ascend
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_event.h b/inferrt/src/hardware/ascend/res_manager/ascend_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..8dc5f87d04ed1ab113732ce8a995186904dfa0f4
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_event.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_ASCEND_EVENT_H
+#define MINDSPORE_ASCEND_EVENT_H
+
+#include "hardware/hardware_abstract/device_event.h"
+#include "acl/acl_rt.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore::device::ascend {
+constexpr uint32_t ACL_EVENT_DEFAULT = 0x0000000Eu;
+
+class HARDWARE_EXPORT AscendEvent : public DeviceEvent {
+ public:
+  AscendEvent();
+  explicit AscendEvent(uint32_t flag, bool use_extensional_api = true);
+  ~AscendEvent() override;
+
+  bool IsReady() const override;
+  void WaitEvent() override;
+  bool WaitEvent(uint32_t stream_id) override;
+  void WaitEventWithoutReset() override;
+  void WaitEventWithoutReset(uint32_t stream_id) override;
+
+  void ResetEvent() override;
+  void ResetEvent(uint32_t stream_id) override;
+
+  void RecordEvent() override;
+  void RecordEvent(uint32_t stream_id) override;
+  bool NeedWait() override;
+  void SyncEvent() override;
+  bool QueryEvent() override;
+  void ElapsedTime(float *cost_time, const DeviceEvent *other) override;
+  bool DestroyEvent() override;
+  void set_wait_stream(aclrtStream wait_stream) override { wait_stream_ = wait_stream; }
+  void set_record_stream(aclrtStream record_stream) override { record_stream_ = record_stream; }
+
+ protected:
+  aclrtEvent event_{nullptr};
+  aclrtStream wait_stream_{nullptr};
+  aclrtStream record_stream_{nullptr};
+  bool need_wait_{false};
+  bool event_destroyed_{false};
+  bool has_flag_{false};
+};
+
+class HARDWARE_EXPORT AscendTimeEvent : public AscendEvent {
+ public:
+  AscendTimeEvent();
+  ~AscendTimeEvent() override = default;
+};
+}  // namespace mindspore::device::ascend
+#endif  // MINDSPORE_ASCEND_EVENT_H
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.cc b/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd7129ddc0d2c7ac1a6e4c5904a3dafcd08118f
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.cc
@@ -0,0 +1,209 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/ascend/res_manager/ascend_hal_manager.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include "common/common.h"
+#include "acl/acl_rt.h"
+
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+namespace {
+constexpr auto kSaturationMode = "Saturation";
+constexpr auto kINFNANMode = "INFNAN";
+
+// std::string GenerateAclInitJsonPath() {
+//   const pid_t pid = getpid();
+//   std::string rankid_str = common::GetEnv("RANK_ID");
+//   if (mindspore::DistributedMeta::GetInstance()->initialized()) {
+//     rankid_str = std::to_string(mindspore::DistributedMeta::GetInstance()->global_rank_id());
+//   }
+//   constexpr size_t random_len = 12;
+//   auto rand_str = Common::GetRandomStr(random_len);
+//   return "/tmp/aclinit_" + rankid_str + "_" + std::to_string(pid) + "_" + rand_str + ".json";
+// }
+}  // namespace
+static thread_local aclrtContext thread_local_rt_context{nullptr};
+
+AscendHalManager AscendHalManager::instance_{};
+AscendHalManager &AscendHalManager::GetInstance() { return instance_; }
+
+void AscendHalManager::InitDevice(uint32_t device_id) {
+  LOG_OUT << "Enter SetRtDevice, current initialized device number:" << initialized_device_set_.size();
+  if (initialized_device_set_.find(device_id) != initialized_device_set_.end()) {
+    LOG_OUT << "Device " << device_id << " has been set";
+    return;
+  }
+
+  auto ret = CALL_ASCEND_API(aclrtSetDevice, UintToInt(device_id));
+  if (ret != ACL_SUCCESS) {
+    auto device_count = GetDeviceCount();
+    LOG_ERROR << "Call aclrtSetDevice failed, ret[" << static_cast<int>(ret) << "]. Got device count[" << device_count
+              << "] and device id[" << device_id << "], please check if device id is valid.";
+  }
+
+  aclrtContext rt_context;
+  ret = CALL_ASCEND_API(aclrtGetCurrentContext, &rt_context);
+  if (ret != ACL_SUCCESS || rt_context == nullptr) {
+    LOG_ERROR << "Call aclrtGetCurrentContext failed, ret[" << ret << "]";
+    return;
+  }
+
+  default_device_context_map_[device_id] = rt_context;
+  (void)initialized_device_set_.insert(device_id);
+}
+
+void AscendHalManager::ResetDevice(uint32_t device_id) {
+  if (initialized_device_set_.find(device_id) != initialized_device_set_.end()) {
+    auto ret = CALL_ASCEND_API(aclrtResetDevice, UintToInt(device_id));
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Call aclrtResetDevice failed, ret[" << ret << "]";
+    }
+    default_device_context_map_[device_id] = nullptr;
+    (void)initialized_device_set_.erase(device_id);
+  }
+}
+
+uint32_t AscendHalManager::GetDeviceCount() {
+  uint32_t device_count = 0;
+  auto ret = CALL_ASCEND_API(aclrtGetDeviceCount, &device_count);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Call aclrtGetDeviceCount failed, ret[" << static_cast<int>(ret) << "]";
+  }
+  return device_count;
+}
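+// Saturation mode clamps overflowed floating-point results to the type's extremes, while
+// INF/NAN mode keeps IEEE-754 semantics; which modes are available depends on the SoC.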
+void AscendHalManager::SetDeviceSatMode(const aclrtFloatOverflowMode &overflow_mode) {
+  auto overflow_mode_str =
+    (overflow_mode == aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION) ? kSaturationMode : kINFNANMode;
+  LOG_OUT << "The current overflow detection mode is " << overflow_mode_str << ".";
+  auto ret = CALL_ASCEND_API(aclrtSetDeviceSatMode, overflow_mode);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Set " << overflow_mode_str << " mode failed.";
+  }
+}
+
+void AscendHalManager::SetOpWaitTimeout(uint32_t op_wait_timeout) {
+  LOG_OUT << "Set op wait timeout: " << op_wait_timeout << " s";
+  auto acl_ret = CALL_ASCEND_API(aclrtSetOpWaitTimeout, op_wait_timeout);
+  if (acl_ret != ACL_SUCCESS) {
+    LOG_ERROR << "Set op wait timeout failed, error: " << acl_ret;
+  }
+}
+
+void AscendHalManager::SetOpExecuteTimeOut(uint32_t op_execute_timeout) {
+  LOG_OUT << "Set op execute timeout: " << op_execute_timeout << " s";
+  auto acl_ret = CALL_ASCEND_API(aclrtSetOpExecuteTimeOut, op_execute_timeout);
+  if (acl_ret != ACL_SUCCESS) {
+    LOG_ERROR << "Set op execute timeout failed, error: " << acl_ret;
+  }
+}
+
+aclrtContext AscendHalManager::CreateContext(uint32_t device_id) {
+  aclrtContext rt_context;
+  auto ret = CALL_ASCEND_API(aclrtCreateContext, &rt_context, device_id);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Call aclrtCreateContext failed, ret: " << ret;
+  }
+  rt_contexts_.insert(rt_context);
+  return rt_context;
+}
+
+void AscendHalManager::ResetContext(uint32_t device_id) {
+  aclrtContext rt_context = CreateContext(device_id);
+  default_device_context_map_[device_id] = rt_context;
+}
+
+void AscendHalManager::DestroyContext(aclrtContext context) {
+  auto ret = CALL_ASCEND_API(aclrtDestroyContext, context);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Failed to destroy context, ret = " << ret << ".";
+  }
+  rt_contexts_.erase(context);
+}
+
+void AscendHalManager::DestroyAllContext() {
+  for (auto context : rt_contexts_) {
+    auto ret = CALL_ASCEND_API(aclrtDestroyContext, context);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Failed to destroy context, ret = " << ret << ".";
+    }
+  }
+  rt_contexts_.clear();
+}
+
+void AscendHalManager::SetContextForce(uint32_t device_id) {
+  if (default_device_context_map_[device_id] == nullptr) {
+    return;
+  }
+  auto ret = CALL_ASCEND_API(aclrtSetCurrentContext, default_device_context_map_[device_id]);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Call aclrtSetCurrentContext failed, ret[" << ret << "]";
+  }
+}
+
+void AscendHalManager::SetContext(uint32_t device_id) {
+  if (default_device_context_map_[device_id] == nullptr) {
+    return;
+  }
+  if (thread_local_rt_context == default_device_context_map_[device_id]) {
+    return;
+  }
+  auto ret = CALL_ASCEND_API(aclrtSetCurrentContext, default_device_context_map_[device_id]);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Call aclrtSetCurrentContext failed, ret[" << ret << "]";
+  }
+  thread_local_rt_context = default_device_context_map_[device_id];
+}
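+// aclInit is currently a no-op here: the json-based initialization from the original
+// implementation is kept below for reference but disabled.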
+void AscendHalManager::InitializeAcl() {
+  // std::lock_guard<std::mutex> lock(acl_init_mutex_);
+  // if (acl_initialized_) {
+  //   return;
+  // }
+  // acl_initialized_ = true;
+  // std::string file_name = GenerateAclInitJsonPath();
+  // std::string json_str;
+  // auto realpath = Common::CreatePrefixPath(file_name);
+  // if (!realpath.has_value()) {
+  //   MS_LOG(WARNING) << "Failed to get real path: [" << file_name << "] in generate aclInit json file path.";
+  //   return;
+  // }
+  // if (!OpDebugConf::GetInstance()->GenerateAclInitJson(realpath.value(), &json_str)) {
+  //   MS_LOG(WARNING) << "Failed to generate aclinit json, the file path is " << realpath.value() << ".";
+  //   return;
+  // }
+  // aclError ret = CALL_ASCEND_API(aclInit, realpath.value().c_str());
+  // TempFileManager::GetInstance().RemoveFile(realpath.value());
+  // if (ret != ACL_SUCCESS) {
+  //   MS_LOG(WARNING) << "Call aclInit failed, the error number is " << ret << ", json is " << json_str;
+  // } else {
+  //   MS_LOG(INFO) << "Call aclInit successfully, json is " << json_str;
+  // }
+}
+
+bool AscendHalManager::EnableLccl() { return false; }
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.h b/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6722bac702c1bbffd6492b0b84de8f838c1e8fb
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_hal_manager.h
@@ -0,0 +1,72 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_HAL_MANAGER_ASCEND_HAL_MANAGER_H_
+#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_HAL_MANAGER_ASCEND_HAL_MANAGER_H_
+
+#include <map>
+#include <mutex>
+#include <set>
+#include "acl/acl_rt.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+class HARDWARE_EXPORT AscendHalManager {
+ public:
+  static AscendHalManager &GetInstance();
+
+  ~AscendHalManager() {}
+
+  // Device management.
+  uint32_t GetDeviceCount();
+  void InitDevice(uint32_t device_id);
+  void ResetDevice(uint32_t device_id);
+  void SetDeviceSatMode(const aclrtFloatOverflowMode &overflow_mode);
+  void SetOpWaitTimeout(uint32_t op_wait_timeout);
+  void SetOpExecuteTimeOut(uint32_t op_execute_timeout);
+  void InitializeAcl();
+  bool EnableLccl();
+
+  // Context management.
+  aclrtContext CreateContext(uint32_t device_id);
+  // Reset the default context of device_id.
+  void ResetContext(uint32_t device_id);
+  void SetContext(uint32_t device_id);
+  void SetContextForce(uint32_t device_id);
+  void DestroyContext(aclrtContext context);
+  void DestroyAllContext();
+
+ private:
+  static AscendHalManager instance_;
+  std::set<uint32_t> initialized_device_set_{};
+  // The default <device_id, aclrtContext> pairs.
+  std::map<uint32_t, aclrtContext> default_device_context_map_;
+
+  // Contexts created by aclrtCreateContext, to be destroyed at teardown.
+  std::set<aclrtContext> rt_contexts_;
+
+  bool acl_initialized_ = false;
+  std::mutex acl_init_mutex_;
+};
+
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_HAL_MANAGER_ASCEND_HAL_MANAGER_H_
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.cc b/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72ac05495a0d488054a939819a29f74994c0e535
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.cc
@@ -0,0 +1,461 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/ascend/res_manager/ascend_res_manager.h"
+#ifndef _WIN32
+#include <dlfcn.h>
+#include <libgen.h>
+#endif
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "hardware/hardware_abstract/dlopen_macro.h"
+#include "hardware/ascend/res_manager/mem_manager/ascend_memory_manager.h"
+#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h"
+#include "hardware/ascend/res_manager/ascend_event.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+#include "acl/acl_rt.h"
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/device_context_manager.h"
+#include "hardware/ascend/res_manager/ascend_hal_manager.h"
+#include "common/common.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+namespace {
+constexpr uint32_t kDefaultHcclExecTimeout = 1800;
+
+using Callback = std::function<void(void)>;
+std::mutex set_opt_mutex;
+
+void AclrtLaunchCallback(void *user_data) {
+  Callback *callback_func = reinterpret_cast<Callback *>(user_data);
+  (*callback_func)();
+  delete callback_func;
+}
+}  // namespace
+
+void AscendResManager::Initialize() {
+  // Use device 0 temporarily.
+  device_id_ = 0;
+  if (initialized_) {
+    AscendHalManager::GetInstance().SetContextForce(device_id_);
+    return;
+  }
+  // Init device.
+  AscendHalManager::GetInstance().InitDevice(device_id_);
+  AscendStreamMng::GetInstance().CreateDefaultStream();
+  mem_manager_ = std::make_shared<AscendMemoryManager>();
+  CHECK_IF_NULL(mem_manager_);
+  mem_manager_->Initialize();
+  initialized_ = true;
+}
+
+void AscendResManager::Destroy() {
+  if (!initialized_) {
+    AscendHalManager::GetInstance().SetContextForce(device_id_);
+    return;
+  }
+  // To avoid calling aclrtProcessReport after process exit, clear all callback threads first.
+  AscendStreamMng::GetInstance().Clear();
+
+  (void)DestroyAllEvents();
+
+  AscendStreamMng::GetInstance().DestroyAllRtEvents();
+  if (!AscendStreamMng::GetInstance().DestroyAllStreams()) {
+    LOG_ERROR << "Failed to destroy all streams when resetting device.";
+  }
+  // Release memory.
+  if (mem_manager_ != nullptr) {
+    mem_manager_->Finalize();
+    mem_manager_ = nullptr;
+  }
+
+  // All unmap/free operations will fail after calling aclrtResetDevice in ResetDevice,
+  // so ClearAllMemory must be called before that.
+  AscendVmmAdapter::GetInstance().ClearAllMemory();
+  AscendHalManager::GetInstance().ResetDevice(device_id_);
+
+  initialized_ = false;
+}
+
+bool AscendResManager::IsEnableVmm() const { return AscendVmmAdapter::GetInstance().IsEnabled(); }
+
+void *AscendResManager::AllocateMemory(size_t size, uint32_t stream_id) const {
+  AscendHalManager::GetInstance().SetContext(device_id_);
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->MallocMemFromMemPool(size, false, false, stream_id);
+}
+
+void *AscendResManager::AllocateStaticMemory(size_t size, uint32_t stream_id) const {
+  AscendHalManager::GetInstance().SetContext(device_id_);
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->MallocMemFromMemPool(size, true, false, stream_id);
+}
+
+size_t AscendResManager::GetMaxUsedMemorySize() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetMaxUsedMemorySize();
+}
+
+void AscendResManager::FreeMemory(void *ptr) const {
+  CHECK_IF_NULL(ptr);
+  CHECK_IF_NULL(mem_manager_);
+  mem_manager_->FreeMemFromMemPool(ptr);
+}
+
+void AscendResManager::FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
+                                       const std::vector<size_t> &keep_addr_sizes) const {
+  AscendMemoryPool::GetInstance().FreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes);
+}
+
+void AscendResManager::DefragMemory() { AscendMemoryPool::GetInstance().DefragMemory(); }
+
+// Relevant functions to manage memory statistics.
+size_t AscendResManager::GetTotalMemStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetTotalMemStatistics();
+}
+
+size_t AscendResManager::GetTotalUsedMemStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetTotalUsedMemStatistics();
+}
+
+size_t AscendResManager::GetTotalIdleMemStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetTotalIdleMemStatistics();
+}
+
+size_t AscendResManager::GetTotalEagerFreeMemStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetTotalEagerFreeMemStatistics();
+}
+
+size_t AscendResManager::GetUsedMemPeakStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetUsedMemPeakStatistics();
+}
+
+size_t AscendResManager::GetReservedMemPeakStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetReservedMemPeakStatistics();
+}
+
+std::unordered_map<std::string, std::size_t> AscendResManager::GetBlockCountsStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetBlockCountsStatistics();
+}
+
+std::unordered_map<std::string, std::size_t> AscendResManager::GetBlockUnitSizeStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetBlockUnitSizeStatistics();
+}
+
+DeviceMemInfo AscendResManager::GetCommonMemBlocksInfoStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetCommonMemBlocksInfoStatistics();
+}
+
+DeviceMemInfo AscendResManager::GetPersistentMemBlocksInfoStatistics() const {
+  CHECK_IF_NULL(mem_manager_);
+  return mem_manager_->GetPersistentMemBlocksInfoStatistics();
+}
+
+void AscendResManager::ResetMaxMemoryReserved() {
+  CHECK_IF_NULL(mem_manager_);
+  auto memory_pool = mem_manager_->GetMemoryPool();
+  CHECK_IF_NULL(memory_pool);
+  memory_pool->ResetMaxMemReserved();
+}
+
+void AscendResManager::ResetMaxMemoryAllocated() {
+  CHECK_IF_NULL(mem_manager_);
+  auto memory_pool = mem_manager_->GetMemoryPool();
+  CHECK_IF_NULL(memory_pool);
+  memory_pool->ResetMaxMemAllocated();
+}
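+// Releases cached, currently-unused blocks from the memory pool back to the device and
+// returns the freed size in bytes (behavior assumed from the pool interface).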
+size_t AscendResManager::EmptyCache() {
+  CHECK_IF_NULL(mem_manager_);
+  auto memory_pool = mem_manager_->GetMemoryPool();
+  CHECK_IF_NULL(memory_pool);
+  return memory_pool->EmptyCache();
+}
+
+std::vector<void *> AscendResManager::AllocateContinuousMemory(const std::vector<size_t> &size_list,
+                                                               uint32_t stream_id) const {
+  AscendHalManager::GetInstance().SetContext(device_id_);
+
+  CHECK_IF_NULL(mem_manager_);
+  std::vector<size_t> aligned_size_list;
+  for (auto size : size_list) {
+    auto align_size = device::MemoryManager::GetCommonAlignSize(size);
+    aligned_size_list.emplace_back(align_size);
+  }
+  return mem_manager_->MallocContinuousMemFromMemPool(aligned_size_list, stream_id);
+}
+
+bool AscendResManager::BindDeviceToCurrentThread(bool force_bind) const {
+  static thread_local std::once_flag is_set;
+  std::call_once(is_set, [this]() {
+    auto ret = CALL_ASCEND_API(aclrtSetDevice, static_cast<int32_t>(device_id_));
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Device " << device_id_ << " call aclrtSetDevice failed, ret:" << static_cast<int>(ret);
+    }
+  });
+
+  if (force_bind) {
+    AscendHalManager::GetInstance().SetContextForce(device_id_);
+  } else {
+    AscendHalManager::GetInstance().SetContext(device_id_);
+  }
+
+  return true;
+}
+
+bool AscendResManager::CreateStream(size_t *stream_id) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  AscendStreamMng::GetInstance().CreateStream(stream_id);
+  return true;
+}
+
+bool AscendResManager::CreateStreamWithPriority(size_t *stream_id, int32_t priority) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  AscendStreamMng::GetInstance().CreateStreamWithFlags(stream_id, ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC,
+                                                       static_cast<int32_t>(priority));
+  return true;
+}
+
+bool AscendResManager::DestroyStream(size_t stream_id) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  AscendStreamMng::GetInstance().DestroyStream(stream_id);
+  return true;
+}
+
+size_t AscendResManager::QueryStreamSize() const { return AscendStreamMng::GetInstance().QueryStreamSize(); }
+
+std::vector<uint32_t> AscendResManager::GetStreamIds() const { return AscendStreamMng::GetInstance().GetStreamIds(); }
+
+bool AscendResManager::single_op_multi_stream_enable() const {
+  return AscendStreamMng::GetInstance().single_op_multi_stream_enable();
+}
+
+void AscendResManager::set_single_op_multi_stream_enable(bool single_op_multi_stream_enable) {
+  return AscendStreamMng::GetInstance().set_single_op_multi_stream_enable(single_op_multi_stream_enable);
+}
+
+void *AscendResManager::GetStream(size_t stream_id) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return nullptr;
+  }
+  return AscendStreamMng::GetInstance().GetStream(stream_id);
+}
+
+void AscendResManager::SetCurrentStreamId(size_t stream_id) {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return;
+  }
+  AscendStreamMng::GetInstance().set_current_stream(stream_id);
+}
+
+size_t AscendResManager::GetCurrentStreamId() const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return SIZE_MAX;
+  }
+  return AscendStreamMng::GetInstance().current_stream();
+}
+
+bool AscendResManager::QueryStream(size_t stream_id) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  return AscendStreamMng::GetInstance().QueryStream(stream_id);
+}
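+// The Sync* helpers below block the calling thread until the given stream (or every
+// stream) has drained its queued work.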
+bool AscendResManager::SyncStream(size_t stream_id) const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  return AscendStreamMng::GetInstance().SyncStream(stream_id);
+}
+
+bool AscendResManager::SyncAllStreams(bool sync_device) const {
+  AscendHalManager::GetInstance().SetContext(device_id_);
+  return AscendStreamMng::GetInstance().SyncAllStreams(sync_device);
+}
+
+bool AscendResManager::SyncNotDefaultStreams() const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return false;
+  }
+  return AscendStreamMng::GetInstance().SyncNotDefaultStreams();
+}
+
+size_t AscendResManager::DefaultStream() const {
+  if (!BindDeviceToCurrentThread(false)) {
+    LOG_ERROR << "Bind context to current thread failed";
+    return SIZE_MAX;
+  }
+  return AscendStreamMng::GetInstance().default_stream_id();
+}
+
+// ACL_EVENT_TIME_LINE: the number of events that can be created is unlimited, and the events can be used
+// to compute the elapsed time between events, which may cause some performance loss.
+// ACL_EVENT_SYNC: the number of events that can be created is limited, and the events can be used for
+// synchronization between multiple streams.
+// ACL_EVENT_CAPTURE_STREAM_PROGRESS: the number of events that can be created is unlimited and performance
+// is high, but the events cannot be used for timing or synchronization.
+DeviceEventPtr AscendResManager::CreateRuntimeEvent(bool enable_blocking, bool enable_record_wait) {
+  if (!enable_blocking && !enable_record_wait) {
+    LOG_ERROR << "Bad parameters, enable_blocking is false and enable_record_wait is false.";
+  }
+
+  uint32_t flag = 0;
+  if (enable_blocking) {
+    flag |= ACL_EVENT_SYNC;
+  }
+  if (enable_record_wait) {
+    flag |= ACL_EVENT_CAPTURE_STREAM_PROGRESS;
+  }
+  return std::make_shared<AscendEvent>(flag);
+}
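+// A typical timing use, sketched under the assumption that both events are recorded on
+// the same stream:
+//   auto start = res_mgr->CreateEventWithFlag(true, false, true);
+//   auto end = res_mgr->CreateEventWithFlag(true, false, true);
+//   start->RecordEvent(stream_id); /* ... launch kernels ... */ end->RecordEvent(stream_id);
+//   float ms = 0.0f;
+//   end->ElapsedTime(&ms, start.get());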
+DeviceEventPtr AscendResManager::CreateEventWithFlag(bool enable_timing, bool blocking, bool use_extensional_api) {
+  auto flag = enable_timing ? (ACL_EVENT_TIME_LINE | ACL_EVENT_SYNC) : ACL_EVENT_SYNC;
+  auto event = std::make_shared<AscendEvent>(flag, use_extensional_api);
+  CHECK_IF_NULL(event);
+  std::lock_guard<std::mutex> lock(device_events_mutex_);
+  device_events_.push_back(event);
+  return event;
+}
+
+bool AscendResManager::DestroyEvent(const DeviceEventPtr &event) {
+  CHECK_IF_NULL(event);
+  if (!event->DestroyEvent()) {
+    LOG_ERROR << "Destroy Event failed.";
+    return false;
+  }
+  std::lock_guard<std::mutex> lock(device_events_mutex_);
+  const auto &iter = std::find(device_events_.begin(), device_events_.end(), event);
+  if (iter == device_events_.end()) {
+    LOG_OUT << "Can't find specified device event.";
+    return false;
+  }
+  (void)device_events_.erase(iter);
+  return true;
+}
+
+bool AscendResManager::DestroyAllEvents() {
+  DeviceEventPtrList device_events_inner;
+  {
+    std::lock_guard<std::mutex> lock(device_events_mutex_);
+    device_events_inner = device_events_;
+    device_events_.clear();
+  }
+  (void)std::for_each(device_events_inner.begin(), device_events_inner.end(), [this](const auto &event) {
+    CHECK_IF_NULL(event);
+    if (!event->DestroyEvent()) {
+      LOG_ERROR << "Destroy Event failed.";
+    }
+  });
+  return true;
+}
+
+void *AscendResManager::GetCopyDataStream() const {
+  auto copy_out_data_stream = AscendStreamMng::GetInstance().GetCopyOutStream();
+  if (copy_out_data_stream == nullptr) {
+    size_t copy_stream_id;
+    AscendStreamMng::GetInstance().CreateStream(&copy_stream_id);
+    LOG_OUT << "Create ascend copy data stream, stream id: " << copy_stream_id;
+    copy_out_data_stream = AscendStreamMng::GetInstance().GetStream(copy_stream_id);
+    AscendStreamMng::GetInstance().SetCopyOutStream(copy_out_data_stream);
+  }
+  return copy_out_data_stream;
+}
+
+bool AscendResManager::RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                                   const std::vector<std::pair<uint32_t, void *>> &memory_stream_addresses,
+                                   const DeviceEventPtr &input_event) {
+  return mem_manager_->RecordEvent(task_id_on_stream, user_stream_id, memory_stream_addresses, input_event);
+}
+
+bool AscendResManager::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) {
+  return mem_manager_->WaitEvent(task_id_on_stream, user_stream_id, memory_stream_id);
+}
+
+bool AscendResManager::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id) {
+  return mem_manager_->WaitEvent(task_id_on_stream, user_stream_id);
+}
+
+bool AscendResManager::SyncAllEvents() { return mem_manager_->SyncAllEvents(); }
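+// Launches a host callback on the given stream. If ACL rejects the launch, the fallback
+// below synchronizes the stream and runs the callback inline on the host thread.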
+bool AscendResManager::LaunchCallback(std::function<void(void)> callback_func, size_t stream_id, bool is_block) const {
+  auto stream = AscendStreamMng::GetInstance().GetStream(stream_id);
+  if (stream == nullptr) {
+    stream = AscendStreamMng::GetInstance().default_stream();
+  }
+  CHECK_IF_NULL(stream);
+  auto block_type =
+    is_block ? aclrtCallbackBlockType::ACL_CALLBACK_BLOCK : aclrtCallbackBlockType::ACL_CALLBACK_NO_BLOCK;
+  auto callback_func_ptr = new Callback(callback_func);
+  aclError ret = CALL_ASCEND_API(aclrtLaunchCallback, AclrtLaunchCallback, callback_func_ptr, block_type, stream);
+  LOG_OUT << "Launch callback for stream_id : " << stream_id << ", ret : " << ret << ".";
+  if (ret != ACL_SUCCESS) {
+    delete callback_func_ptr;
+    LOG_ERROR << "Launch callback for stream_id : " << stream_id << " failed, ret : " << ret << ".";
+    if (SyncStream(stream_id)) {
+      callback_func();
+      return true;
+    }
+
+    ResetStreamAndCtx();
+    return false;
+  }
+  return true;
+}
+
+void AscendResManager::ResetStreamAndCtx() const {
+  AscendStreamMng::GetInstance().DestroyAllStreams();
+  AscendHalManager::GetInstance().ResetContext(device_id_);
+  AscendStreamMng::GetInstance().CreateDefaultStream();
+}
+
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.h b/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..a75ad983972e905337140eed217318e868aa2be5
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_res_manager.h
@@ -0,0 +1,133 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_ASCEND_RES_MANAGER_H_
+#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_ASCEND_RES_MANAGER_H_
+
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "acl/acl_rt.h"
+#include "hardware/ascend/res_manager/ascend_stream_manager.h"
+#include "hardware/hardware_abstract/device_event.h"
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+std::string GetCurrentDir();
+
+using DeviceMemInfo = std::unordered_map<std::string, std::unordered_map<std::string, size_t>>;
+class HARDWARE_EXPORT AscendResManager : public DeviceResManager {
+ public:
+  AscendResManager() = default;
+  ~AscendResManager() override = default;
+
+  void Initialize() override;
+
+  void Destroy() override;
+
+  std::shared_ptr<MemoryManager> mem_manager() const override { return mem_manager_; }
+
+  std::vector<void *> AllocateContinuousMemory(const std::vector<size_t> &size_list,
+                                               uint32_t stream_id = kDefaultStreamIndex) const override;
+  bool IsEnableVmm() const override;
+
+  bool BindDeviceToCurrentThread(bool force_bind) const override;
+  void *GetStream() const override { return AscendStreamMng::GetInstance().default_stream(); }
+  void *GetCopyDataStream() const;
+
+  void *AllocateStaticMemory(size_t size, uint32_t stream_id = kDefaultStreamIndex) const;
+  void *AllocateMemory(size_t size, uint32_t stream_id = kDefaultStreamIndex) const override;
+  void FreeMemory(void *ptr) const override;
+  void FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
+                       const std::vector<size_t> &keep_addr_sizes) const override;
+  void DefragMemory() override;
+
+  size_t GetMaxUsedMemorySize() const override;
+
+  // Relevant functions to manage memory statistics.
+  size_t GetTotalMemStatistics() const override;
+  size_t GetTotalUsedMemStatistics() const override;
+  size_t GetTotalIdleMemStatistics() const override;
+  size_t GetTotalEagerFreeMemStatistics() const override;
+  size_t GetUsedMemPeakStatistics() const override;
+  size_t GetReservedMemPeakStatistics() const override;
+  std::unordered_map<std::string, std::size_t> GetBlockCountsStatistics() const override;
+  std::unordered_map<std::string, std::size_t> GetBlockUnitSizeStatistics() const override;
+  std::unordered_map<std::string, std::unordered_map<std::string, size_t>> GetCommonMemBlocksInfoStatistics()
+    const override;
+  std::unordered_map<std::string, std::unordered_map<std::string, size_t>>
+  GetPersistentMemBlocksInfoStatistics() const override;
+  void ResetMaxMemoryReserved() override;
+  void ResetMaxMemoryAllocated() override;
+
+  size_t EmptyCache() override;
+
+  bool CreateStream(size_t *stream_id) const override;
+  bool CreateStreamWithPriority(size_t *stream_id, int32_t priority) const override;
+  bool DestroyStream(size_t stream_id) const override;
+  size_t QueryStreamSize() const override;
+  std::vector<uint32_t> GetStreamIds() const override;
+  void *GetStream(size_t stream_id) const override;
+  void SetCurrentStreamId(size_t stream_id) override;
+  size_t GetCurrentStreamId() const override;
+  bool QueryStream(size_t stream_id) const override;
+  bool SyncStream(size_t stream_id = 0) const override;
+  bool SyncAllStreams(bool sync_device = true) const override;
+  bool SyncNotDefaultStreams() const override;
+  size_t DefaultStream() const override;
+
+  DeviceEventPtr CreateRuntimeEvent(bool enable_blocking, bool enable_record_wait) override;
+  DeviceEventPtr CreateEventWithFlag(bool enable_timing, bool blocking, bool use_extensional_api) override;
+  bool DestroyEvent(const DeviceEventPtr &event) override;
+  bool DestroyAllEvents() override;
+
+  bool single_op_multi_stream_enable() const override;
+  void set_single_op_multi_stream_enable(bool single_op_multi_stream_enable) override;
+  // Only used in graph_mode with MS_DISABLE_REF_MODE; delete it when MS_DISABLE_REF_MODE is removed.
+  void SetCPUMemManager();
+
+  // Override interface for multi stream event control.
+  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                   const std::vector<std::pair<uint32_t, void *>> &memory_stream_addresses,
+                   const DeviceEventPtr &input_event) override;
+
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) override;
+
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id) override;
+
+  bool SyncAllEvents() override;
+
+  bool LaunchCallback(std::function<void(void)> callback_func, size_t stream_id, bool is_block = false) const override;
+
+  void ResetStreamAndCtx() const override;
+
+ private:
+  bool initialized_ = false;
+  std::shared_ptr<MemoryManager> mem_manager_{nullptr};
+  DeviceEventPtrList device_events_{};
+  std::mutex device_events_mutex_;
+  uint32_t device_id_{0};
+  bool enable_memory_tracker_{false};
+};
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_ASCEND_RES_MANAGER_H_
diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.cc b/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce6aeb19196527ab2dec390a35ee38c494a8cabe
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.cc
@@ -0,0 +1,408 @@
+/**
+ * Copyright 2022-2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/ascend/res_manager/ascend_stream_manager.h"
+
+#include <algorithm>
+#include "common/common.h"
+#include "hardware/hardware_abstract/common.h"
+#include "acl/error_codes/rt_error_codes.h"
+#include "hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+namespace {
+constexpr size_t kIndex0 = 0;
+}
+AscendStreamMng &AscendStreamMng::GetInstance() {
+  static AscendStreamMng instance{};
+  return instance;
+}
+
+void AscendStreamMng::DestroyAllRtEvents() {
+  for (size_t i = 0; i < events_.size(); ++i) {
+    if (events_[i] != nullptr) {
+      auto rt_ret = CALL_ASCEND_API(aclrtDestroyEvent, events_[i]);
+      if (rt_ret != ACL_SUCCESS) {
+        LOG_ERROR << "Call aclrtDestroyEvent failed, ret:" << rt_ret;
+      }
+    }
+  }
+  events_.clear();
+}
+
+void AscendStreamMng::DeleteEvent() {
+  if (cur_event_num_ == 0) {
+    LOG_OUT << "Total event num is 0, no event to delete";
+  } else {
+    --cur_event_num_;
+  }
+}
+
+void AscendStreamMng::DeleteStream() {
+  if (cur_stream_num_ == 0) {
+    LOG_OUT << "Total stream num is 0, no stream to delete";
+  } else {
+    --cur_stream_num_;
+  }
+}
+
+uint32_t AscendStreamMng::GetCurAllocStreamId() const {
+  if (cur_stream_num_ == 0) {
+    LOG_ERROR << "Stream num is 0, there is no stream id to get";
+  }
+  return cur_stream_num_ - 1;
+}
+
+void AscendStreamMng::CreateStream(aclrtStream *stream, int32_t priority) {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  auto ret = CALL_ASCEND_API(aclrtCreateStreamWithConfig, stream, IntToUint(priority),
+                             (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC));
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Create stream failed, ret:" << ret;
+  }
+  ret = CALL_ASCEND_API(aclrtSetStreamFailureMode, *stream, ACL_STOP_ON_FAILURE);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtSetStreamFailureMode failed, ret:" << ret;
+  }
+  (void)streams_.emplace_back(*stream);
+}
+
+void AscendStreamMng::CreateStream(size_t *stream_id, int32_t priority) {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  aclrtStream stream;
+  auto ret = CALL_ASCEND_API(aclrtCreateStreamWithConfig, &stream, IntToUint(priority),
+                             (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC));
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Create stream failed, ret:" << ret;
+  }
+  ret = CALL_ASCEND_API(aclrtSetStreamFailureMode, stream, ACL_STOP_ON_FAILURE);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtSetStreamFailureMode failed, ret:" << ret;
+  }
+  *stream_id = streams_.size();
+  (void)streams_.emplace_back(stream);
+}
+
+void AscendStreamMng::CreateStreamWithFlags(aclrtStream *stream, uint32_t flags, int32_t priority) {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  auto ret = CALL_ASCEND_API(aclrtCreateStreamWithConfig, stream, IntToUint(priority), flags);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Create stream failed, ret:" << ret;
+  }
+  ret = CALL_ASCEND_API(aclrtSetStreamFailureMode, *stream, ACL_STOP_ON_FAILURE);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtSetStreamFailureMode failed, ret:" << ret;
+  }
+  (void)streams_.emplace_back(*stream);
+}
+
+void AscendStreamMng::CreateStreamWithFlags(size_t *stream_id, uint32_t flags, int32_t priority) {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  aclrtStream stream;
+  auto ret = CALL_ASCEND_API(aclrtCreateStreamWithConfig, &stream, IntToUint(priority), flags);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Create stream failed, ret:" << ret;
+  }
+  ret = CALL_ASCEND_API(aclrtSetStreamFailureMode, stream, ACL_STOP_ON_FAILURE);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtSetStreamFailureMode failed, ret:" << ret;
+  }
+  *stream_id = streams_.size();
+  (void)streams_.emplace_back(stream);
+}
+
+aclrtEvent AscendStreamMng::ApplyRtEvent() {
+  aclrtEvent rt_event = nullptr;
+  // Use the ex API of event so that there is no limit on the total number of events.
+  uint32_t flag = ACL_EVENT_SYNC;
+  auto ret = CALL_ASCEND_API(aclrtCreateEventExWithFlag, &rt_event, flag);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "aclrtCreateEventExWithFlag failed, ret : " << ret << ".";
+  }
+  (void)events_.emplace_back(rt_event);
+  return rt_event;
+}
+
+bool AscendStreamMng::DestroyStream(size_t stream_id) {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  if (stream_id >= streams_.size()) {
+    LOG_ERROR << "Ascend stream not found for stream id " << stream_id;
+    return false;
+  }
+  if (streams_.at(stream_id) == nullptr) {
+    LOG_OUT << "Ascend stream has been destroyed for stream id " << stream_id;
+    return true;
+  }
+  const auto ret = CALL_ASCEND_API(aclrtDestroyStream, streams_.at(stream_id));
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Call aclrtDestroyStream failed, ret[" << ret << "]";
+  }
+  streams_[stream_id] = nullptr;
+  if (communication_stream_id_ == stream_id) {
+    communication_stream_ = nullptr;
+  }
+  if (default_stream_id_ == stream_id) {
+    default_stream_ = nullptr;
+  }
+
+  return true;
+}
+
+bool AscendStreamMng::ForceDestroyAllStreams() {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  for (const auto &stream : streams_) {
+    if (stream == nullptr) {
+      continue;
+    }
+    const auto ret = CALL_ASCEND_API(aclrtDestroyStreamForce, stream);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Call aclrtDestroyStreamForce failed, ret[" << ret << "]";
+    }
+  }
+  streams_.clear();
+  default_stream_ = nullptr;
+  communication_stream_ = nullptr;
+  return true;
+}
+
+bool AscendStreamMng::DestroyAllStreams() {
+  std::lock_guard<std::mutex> lock_streams(stream_mutex_);
+  for (const auto &stream : streams_) {
+    if (stream == nullptr) {
+      continue;
+    }
+    const auto ret = CALL_ASCEND_API(aclrtDestroyStream, stream);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Call aclrtDestroyStream failed, ret[" << ret << "]";
+    }
+  }
+  streams_.clear();
+  default_stream_ = nullptr;
+  communication_stream_ = nullptr;
+  return true;
+}
+
+aclrtStream AscendStreamMng::GetStream(size_t stream_id) const {
+  if (stream_id >= streams_.size()) {
+    LOG_OUT << "Stream for stream id[" << stream_id << "] not found, return nullptr.";
+    return nullptr;
+  }
+  return streams_[stream_id];
+}
+
+bool AscendStreamMng::SyncStream(size_t stream_id) const {
+  if (stream_id >= streams_.size()) {
+    LOG_ERROR << "Stream for stream id[" << stream_id << "] has not been created.";
+  }
+  const auto stream = streams_[stream_id];
+  if (stream == nullptr) {
+    LOG_OUT << "Stream for stream id[" << stream_id << "] has been destroyed.";
+    return false;
+  }
+  return SyncStream(stream);
+}
+
+bool AscendStreamMng::SyncStream(aclrtStream stream) const {
+  CHECK_IF_NULL(stream);
+  LOG_OUT << "Sync stream: " << stream;
+  auto RET = ACL_SUCCESS;
+  try {
+    GilReleaseWithCheck gil_release;
+    RET = CALL_ASCEND_API(aclrtSynchronizeStreamWithTimeout, stream, -1);
+    if (RET != ACL_SUCCESS && RET != ACL_ERROR_RT_AICORE_OVER_FLOW) {  // Overflow is tolerated here.
+      LOG_ERROR << "Call runtime aclrtSynchronizeStreamWithTimeout error."
+                << "Please do the following three things to confirm whether it is caused by the "
+                << "execution failure of a certain operator.\n"
+                << " 1. Set mindspore.runtime.launch_blocking() at the beginning of your python script.\n"
+                << " 2. Run your python script again.\n"
+                << " 3. Grep 'Sync run failed' in your logs; it always stays at the end of your logs.\n"
+                << "Then you will get detailed information about the failing operator.";
+      return false;
+    }
+  } catch (const std::exception &e) {
+    LOG_ERROR << "Sync stream failed. " << e.what()
+              << "Please do the following three things to confirm whether it is caused by the "
+              << "execution failure of a certain operator.\n"
+              << " 1. Set mindspore.runtime.launch_blocking() at the beginning of your python script.\n"
+              << " 2. Run your python script again.\n"
+              << " 3. Grep 'Sync run failed' in your logs; it always stays at the end of your logs.\n"
+              << "Then you will get detailed information about the failing operator.";
+    return false;
+  }
+  if (RET == ACL_ERROR_RT_AICORE_OVER_FLOW) {
+    LOG_OUT << "Call runtime aclrtSynchronizeStreamWithTimeout, the stream got an overflow.";
+  }
+  return true;
+}
+
+bool AscendStreamMng::SyncAllStreams(bool sync_device) const {
+  auto RET = ACL_ERROR_NONE;
+  try {
+    GilReleaseWithCheck gil_release;
+    if (sync_device) {
+      // According to CANN, we need to set the timeout to 2 hours for aclrtSynchronizeDeviceWithTimeout.
+      int timeout = 7200000;
+      RET = CALL_ASCEND_API(aclrtSynchronizeDeviceWithTimeout, timeout);
+      if (RET != ACL_ERROR_NONE && RET != ACL_ERROR_RT_AICORE_OVER_FLOW) {
+        LOG_ERROR << "Call runtime aclrtSynchronizeDeviceWithTimeout error."
+                  << "Please do the following three things to confirm whether it is caused by the "
+                  << "execution failure of a certain operator.\n"
+                  << " 1. Set mindspore.runtime.launch_blocking() at the beginning of your python script.\n"
+                  << " 2. Run your python script again.\n"
+                  << " 3. Grep 'Sync run failed' in your logs; it always stays at the end of your logs.\n"
+                  << "Then you will get detailed information about the failing operator.";
+        return false;
+      }
+    } else {
+      for (size_t i = 0; i < streams_.size(); i++) {
+        const auto stream = streams_[i];
+        if (stream != nullptr && !SyncStream(stream)) {
+          LOG_ERROR << "SyncStream for stream id " << i << " failed.";
+          return false;
+        }
+      }
+    }
+  } catch (const std::exception &e) {
+    std::string sync_method = sync_device ? "aclrtSynchronizeDeviceWithTimeout" : "aclrtSynchronizeStreamWithTimeout";
+    LOG_ERROR << sync_method << " failed. " << e.what()
+              << "Please do the following three things to confirm whether it is caused by the "
+              << "execution failure of a certain operator.\n"
+              << " 1. Set mindspore.runtime.launch_blocking() at the beginning of your python script.\n"
+              << " 2. Run your python script again.\n"
+              << " 3. Grep 'Sync run failed' in your logs; it always stays at the end of your logs.\n"
+              << "Then you will get detailed information about the failing operator.";
+    return false;
+  }
+  if (RET == ACL_ERROR_RT_AICORE_OVER_FLOW) {
+    std::string sync_method = sync_device ? "aclrtSynchronizeDeviceWithTimeout" : "aclrtSynchronizeStreamWithTimeout";
+    LOG_OUT << "Call runtime " << sync_method << ", the stream got an overflow."
+            << "Please do the following three things to confirm whether it is caused by the "
+            << "execution failure of a certain operator.\n"
+            << " 1. Set mindspore.runtime.launch_blocking() at the beginning of your python script.\n"
+            << " 2. Run your python script again.\n"
+            << " 3. Grep 'Sync run failed' in your logs; it always stays at the end of your logs.\n"
+            << "Then you will get detailed information about the failing operator.";
+  }
+  return true;
+}
+
+bool AscendStreamMng::SyncNotDefaultStreams() const {
+  bool res = true;
+  for (size_t i = 0; i < streams_.size(); i++) {
+    if (i != default_stream_id_ && !SyncStream(i)) {
+      LOG_ERROR << "Failed to sync for ascend stream id: " << i;
+      res = false;
+    }
+  }
+  return res;
+}
+
+bool AscendStreamMng::SyncExceptStreamsInList(const std::set<aclrtStream> &except_streams) const {
+  bool res = true;
+  for (size_t i = 0; i < streams_.size(); i++) {
+    if (except_streams.count(streams_[i]) > 0) {
+      LOG_OUT << "Stream id:" << i << " is in the except list, skip synchronization.";
+      continue;
+    }
+    if (!SyncStream(i)) {
+      LOG_ERROR << "Failed to sync for ascend stream id: " << i;
+      res = false;
+    }
+  }
+  return res;
+}
+
+size_t AscendStreamMng::QueryStreamSize() const { return streams_.size(); }
+
+bool AscendStreamMng::QueryStream(size_t stream_id) {
+  if (stream_id >= streams_.size()) {
+    LOG_ERROR << "Stream for stream id[" << stream_id << "] has not been created.";
+  }
+  const auto stream = streams_[stream_id];
+  if (stream == nullptr) {
+    LOG_OUT << "Stream for stream id[" << stream_id << "] has been destroyed.";
+    return false;
+  }
+
+  aclrtStreamStatus status;
+  auto ret = CALL_ASCEND_API(aclrtStreamQuery, stream, &status);
+  if (ret != ACL_SUCCESS) {
+    LOG_ERROR << "Failed to query completion status for stream id: " << stream_id;
+  }
+  return status == ACL_STREAM_STATUS_COMPLETE;
+}
+
+size_t AscendStreamMng::GetStreamId(void *stream_ptr) {
+  auto iter = std::find(streams_.begin(), streams_.end(), stream_ptr);
+  if (iter == streams_.end()) {
+    LOG_ERROR << "Failed to find stream_ptr in streams_, stream_ptr:" << stream_ptr;
+  }
+
+  return LongToSize(std::distance(streams_.begin(), iter));
+}
+
+std::vector<uint32_t> AscendStreamMng::GetStreamIds() const {
+  std::vector<uint32_t> stream_ids;
+  for (size_t i = 0; i < streams_.size(); i++) {
+    if (streams_[i] != nullptr) {
+      (void)stream_ids.emplace_back(static_cast<uint32_t>(i));
+    }
+  }
+  return stream_ids;
+}
+
+void AscendStreamMng::CreateDefaultStream() {
+  if (default_stream_ == nullptr) {
+    CreateStream(&default_stream_id_);
+    LOG_OUT << "Create ascend default stream, stream id: " << default_stream_id_;
+    default_stream_ = GetStream(default_stream_id_);
+    CHECK_IF_NULL(default_stream_);
+  } else {
+    LOG_OUT << "The default compute stream is already created, skip.";
+  }
+
+  if (communication_stream_ == nullptr) {
+    CreateStream(&communication_stream_id_);
+    LOG_OUT << "Create ascend communication stream, stream id: " << communication_stream_id_;
+    communication_stream_ = GetStream(communication_stream_id_);
+    CHECK_IF_NULL(communication_stream_);
+  } else {
+    LOG_OUT << "The default communication stream is already created, skip.";
+  }
+}
+
+size_t AscendStreamMng::default_stream_id() const {
+  if (default_stream_ == nullptr) {
+    LOG_ERROR << "The default stream is not created";
+  }
+  return default_stream_id_;
+}
+size_t AscendStreamMng::communication_stream_id() const {
+  if (communication_stream_ == nullptr) {
+    LOG_ERROR << "The communication stream is not created";
+  }
+  return communication_stream_id_;
+}
+aclrtStream AscendStreamMng::default_stream() const {
return default_stream_; } +aclrtStream AscendStreamMng::communication_stream() const { return communication_stream_; } + +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.h b/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..b738b3284b9055b427def13135ad30aab703ed1c --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/ascend_stream_manager.h @@ -0,0 +1,149 @@ +/** + * Copyright 2021-2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_STREAM_MANAGER_ASCEND_STREAM_MANAGER_H_ +#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_STREAM_MANAGER_ASCEND_STREAM_MANAGER_H_ + +#include +#include +#include +#include + +#include "acl/acl_rt.h" +#include "hardware/hardware_abstract/visible.h" + +namespace mindspore { +namespace device { +namespace ascend { +class HARDWARE_EXPORT AscendStreamMng { + public: + static AscendStreamMng &GetInstance(); + + ~AscendStreamMng() = default; + void Clear() { + } + + void ResetResource() { + cur_stream_num_ = 0; + cur_event_num_ = 0; + } + + uint32_t ApplyNewStream() { return cur_stream_num_++; } + + uint32_t ApplyNewEvent() { return cur_event_num_++; } + + aclrtEvent ApplyRtEvent(); + aclrtEvent ApplyRtEventWithFlag(uint32_t flag); + uint32_t GetRtEventId(const aclrtEvent &event) const; + void DestroyAllRtEvents(); + + void DeleteEvent(); + + void DeleteStream(); + + uint32_t GetCurAllocStreamId() const; + + uint32_t cur_stream_num() const { return cur_stream_num_; } + + uint32_t cur_event_num() const { return cur_event_num_; } + + void CreateStream(aclrtStream *stream, int32_t priority = 0); + void CreateStream(size_t *stream_id, int32_t priority = 0); + void RegCallback(aclrtStream stream); + void UnRegCallback(aclrtStream stream, bool delete_item = true); + void CreateStreamWithFlags(aclrtStream *stream, uint32_t flags, int32_t priority = 0); + void CreateStreamWithFlags(size_t *stream_id, uint32_t flags, int32_t priority = 0); + bool DestroyStream(size_t stream_id); + bool DestroyAllStreams(); + bool ForceDestroyAllStreams(); + aclrtStream GetStream(size_t stream_id) const; + bool SyncStream(size_t stream_id) const; + bool SyncStream(aclrtStream stream) const; + // 'sync_device' means whether calling 'aclrtSynchronizeDeviceWithTimeout' or 'aclrtSynchronizeStreamWithTimeout'. + bool SyncAllStreams(bool sync_device = true) const; + bool SyncNotDefaultStreams() const; + // Sync all streams except the streams in except_streams. 
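+  // Illustrative caller sketch (hypothetical, not part of this patch): keep the
+  // copy-in stream running while draining every other stream:
+  //   std::set<aclrtStream> keep{AscendStreamMng::GetInstance().GetCopyInStream()};
+  //   AscendStreamMng::GetInstance().SyncExceptStreamsInList(keep);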
+  bool SyncExceptStreamsInList(const std::set<aclrtStream> &except_streams) const;
+  size_t QueryStreamSize() const;
+  bool QueryStream(size_t stream_id);
+  size_t GetStreamId(void *stream_ptr);
+  std::vector<uint32_t> GetStreamIds() const;
+  void SetBusyStreamNum(uint32_t stream_num) { busy_stream_num_ = stream_num; }
+  uint32_t GetBusyStreamNum() const { return busy_stream_num_; }
+  void SetCopyInStream(aclrtStream stream) { copy_in_stream_ = stream; }
+  void SetCopyOutStream(aclrtStream stream) { copy_out_stream_ = stream; }
+  void SetForwardSendStream(aclrtStream stream) { forward_send_stream_ = stream; }
+  void SetBackwardSendStream(aclrtStream stream) { backward_send_stream_ = stream; }
+  void SetForwardRecvStream(aclrtStream stream) { forward_recv_stream_ = stream; }
+  void SetBackwardRecvStream(aclrtStream stream) { backward_recv_stream_ = stream; }
+  aclrtStream GetCopyInStream() const { return copy_in_stream_; }
+  aclrtStream GetCopyOutStream() const { return copy_out_stream_; }
+  aclrtStream GetForwardSendStream() const { return forward_send_stream_; }
+  aclrtStream GetBackwardSendStream() const { return backward_send_stream_; }
+  aclrtStream GetForwardRecvStream() const { return forward_recv_stream_; }
+  aclrtStream GetBackwardRecvStream() const { return backward_recv_stream_; }
+
+  void set_current_stream(size_t stream_id) { current_stream_id_ = stream_id; }
+  size_t current_stream() const { return current_stream_id_; }
+
+  void CreateDefaultStream();
+  size_t default_stream_id() const;
+  size_t communication_stream_id() const;
+  aclrtStream default_stream() const;
+  aclrtStream communication_stream() const;
+
+  bool single_op_multi_stream_enable() const { return single_op_multi_stream_enable_; }
+  void set_single_op_multi_stream_enable(bool single_op_multi_stream_enable) {
+    single_op_multi_stream_enable_ = single_op_multi_stream_enable;
+  }
+
+ private:
+  // Count of streams and events in the task sink scenario.
+  uint32_t cur_stream_num_{0};
+  uint32_t cur_event_num_{0};
+
+  // The max stream num on device at a time.
+  uint32_t busy_stream_num_{0};
+
+  // Ensure the thread safety for creating and destroying stream.
+  std::mutex stream_mutex_;
+  aclrtStream copy_in_stream_{nullptr};
+  aclrtStream copy_out_stream_{nullptr};
+  aclrtStream forward_send_stream_{nullptr};
+  aclrtStream backward_send_stream_{nullptr};
+  aclrtStream forward_recv_stream_{nullptr};
+  aclrtStream backward_recv_stream_{nullptr};
+
+  // All Ascend ACL streams, including default_stream_.
+  std::vector<aclrtStream> streams_;
+  std::vector<aclrtEvent> events_{};
+
+  // Currently used stream id.
+  size_t current_stream_id_{0};
+
+  // Default stream. We consider the first stream created as the default stream.
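+  // Per CreateDefaultStream() in the .cc above, the compute stream is created
+  // before the communication stream, so on a fresh manager default_stream_id_
+  // is expected to be 0 and communication_stream_id_ to be 1.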
+ aclrtStream default_stream_{nullptr}; + size_t default_stream_id_{0}; + aclrtStream communication_stream_{nullptr}; + size_t communication_stream_id_{0}; + + bool single_op_multi_stream_enable_{false}; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_ASCEND_STREAM_MANAGER_ASCEND_STREAM_MANAGER_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/CMakeLists.txt b/inferrt/src/hardware/ascend/res_manager/mem_manager/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2ef17c933d5b2fa295cc86126ba79dce143d0c --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/CMakeLists.txt @@ -0,0 +1,6 @@ +if(ENABLE_D OR ENABLE_ACL) + file(GLOB _ASCEND_MEM_MANAGER_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") + set_property(SOURCE ${_ASCEND_MEM_MANAGER_SRC_LIST} PROPERTY COMPILE_DEFINITIONS + SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) + add_library(_mindspore_ascend_mem_manager_obj OBJECT ${_ASCEND_MEM_MANAGER_SRC_LIST}) +endif() \ No newline at end of file diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.cc new file mode 100644 index 0000000000000000000000000000000000000000..978b2569e1085f62db73a2176f7074c1b2a6af68 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.cc @@ -0,0 +1,189 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.h" + +#include +#include + +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h" +#include "hardware/ascend/res_manager/ascend_stream_manager.h" +#include "common/common.h" +#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h" +#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h" + +namespace mindspore { +namespace device { +namespace ascend { +// The minimum unit size (8MB) of memory block used for dynamic extend in graph run mode. +static const size_t ASCEND_COMMON_POOL_ALLOC_UNIT_SIZE_FOR_GRAPH_RUN_MODE = 8 << 20; +constexpr char kGlobalOverflowWorkspace[] = "GLOBAL_OVERFLOW_WORKSPACE"; + +void AbstractAscendMemoryPoolSupport::SetMemPoolBlockSize(size_t available_device_mem_size) { + // set by default configuration + SetMemAllocUintSize(kDynamicMemAllocUnitSize, kDynamicMemAllocUnitSize); +} + +namespace { +bool NoAdditionalMemory() { + // use default temporarily. 
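+  // Note: returning true makes CalMemBlockAllocSize() below clamp the grown
+  // block back to the requested size (unless need_recycle is set), so the pool
+  // asks the device for no extra headroom.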
+ return true; +} +} // namespace + +size_t AbstractAscendMemoryPoolSupport::CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle) { + auto device_free_mem_size = free_mem_size(); + if (device_free_mem_size < size) { + LOG_OUT << "The device memory is not enough, the free memory size is " << device_free_mem_size + << ", but the alloc size is " << size; + LOG_OUT << "The dynamic memory pool total size is " << TotalMemStatistics() / kMBToByte << "M, total used size is " + << TotalUsedMemStatistics() / kMBToByte << "M, used peak size is " << UsedMemPeakStatistics() / kMBToByte + << "M."; + LOG_OUT << "Memory Statistics:" << AscendMemAdapter::GetInstance()->DevMemStatistics(); + return 0; + } + + size_t alloc_mem_size; + SetMemPoolBlockSize(device_free_mem_size); + auto alloc_mem_unit_size = MemAllocUnitSize(from_persistent_mem); + if (need_recycle) { + alloc_mem_unit_size = kDynamicMemAllocUnitSize; + } + LOG_OUT << "Get unit block size " << alloc_mem_unit_size; + alloc_mem_size = alloc_mem_unit_size; + + const bool is_graph_run_mode = true; + if (is_graph_run_mode) { + // Growing at adding alloc unit size + while (alloc_mem_size < size) { + alloc_mem_size = alloc_mem_size + alloc_mem_unit_size; + } + } else { + // Growing at twice of alloc unit size + constexpr size_t kDouble = 2; + while (alloc_mem_size < size) { + alloc_mem_size = alloc_mem_size * kDouble; + } + } + + alloc_mem_size = std::min(alloc_mem_size, device_free_mem_size); + if (NoAdditionalMemory() && !need_recycle) { + alloc_mem_size = std::min(alloc_mem_size, size); + } + return alloc_mem_size; +} + +size_t AbstractAscendMemoryPoolSupport::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { + LOG_OUT << "Malloc Memory for Pool, size: " << size; + if (size == 0) { + LOG_ERROR << "Failed to alloc memory pool resource, the size is zero!"; + } + *addr = AscendMemAdapter::GetInstance()->MallocStaticDevMem(size); + if (*addr == nullptr) { + LOG_ERROR << "Alloc device memory pool address is nullptr, failed to alloc memory pool resource!"; + } + return size; +} + +size_t AbstractAscendMemoryPoolSupport::GetMaxUsedMemSize() const { + void *min_used_addr = GetMinUsingMemoryAddr(); + if (min_used_addr == nullptr) { + return 0; + } + return AscendMemAdapter::GetInstance()->GetDynamicMemUpperBound(min_used_addr); +} + +size_t AbstractAscendMemoryPoolSupport::GetVmmUsedMemSize() const { + if (IsEnableVmm()) { + return AscendVmmAdapter::GetInstance().GetAllocatedSize(); + } + return 0; +} + +const bool AbstractAscendMemoryPoolSupport::IsEnableEagerFree() const { + return AscendGmemAdapter::GetInstance().is_eager_free_enabled(); +} + +const bool AbstractAscendMemoryPoolSupport::SyncAllStreams() { return AscendStreamMng::GetInstance().SyncAllStreams(); } + +size_t AbstractAscendMemoryPoolSupport::AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) { + if (IsEnableVmm()) { + return AscendVmmAdapter::GetInstance().AllocDeviceMem(size, addr); + } else if (IsEnableEagerFree()) { + return AscendGmemAdapter::GetInstance().AllocDeviceMem(size, addr); + } else { + LOG_ERROR << "Eager free and VMM are both disabled."; + return 0; + } +} + +size_t AbstractAscendMemoryPoolSupport::FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) { + if (IsEnableVmm()) { + return AscendVmmAdapter::GetInstance().EagerFreeDeviceMem(addr, size); + } else if (IsEnableEagerFree()) { + return AscendGmemAdapter::GetInstance().EagerFreeDeviceMem(addr, size); + } else { + LOG_ERROR << "Eager free and VMM are both disabled."; + 
return 0;
+  }
+}
+
+size_t AbstractAscendMemoryPoolSupport::EmptyCache() { return AscendVmmAdapter::GetInstance().EmptyCache(); }
+
+size_t AbstractAscendMemoryPoolSupport::MmapDeviceMem(const size_t size, const DeviceMemPtr addr) {
+  if (IsEnableVmm()) {
+    return AscendVmmAdapter::GetInstance().MmapDeviceMem(size, addr, total_mem_size());
+  } else if (IsEnableEagerFree()) {
+    auto ret = AscendGmemAdapter::GetInstance().MmapMemory(size, addr);
+    if (ret == nullptr) {
+      LOG_ERROR << "Mmap memory failed.";
+    }
+    return size;
+  }
+  LOG_ERROR << "Eager free and VMM are both disabled.";
+  return 0;
+}
+
+bool AbstractAscendMemoryPoolSupport::FreeDeviceMem(const DeviceMemPtr &addr) {
+  CHECK_IF_NULL(addr);
+  int64_t max_actual = ActualPeakStatistics();
+  LOG_OUT << "Max actual used memory size is " << max_actual;
+  AscendMemAdapter::GetInstance()->UpdateActualPeakMemory(max_actual);
+  int64_t max_peak = UsedMemPeakStatistics();
+  LOG_OUT << "Max peak used memory size is " << max_peak;
+  AscendMemAdapter::GetInstance()->UpdateUsedPeakMemory(max_peak);
+  // With ge kernel disabled, the two-pointer mem adapter is used, which does not support free.
+  // if (!IsEnableVmm() && !IsEnableEagerFree() && !IsDisableGeKernel()) {
+  //   return AscendMemAdapter::GetInstance()->FreeStaticDevMem(addr);
+  // }
+  return true;
+}
+
+void AbstractAscendMemoryPoolSupport::ResetIdleMemBuf() const {
+  // Warning: this method is not in use currently; it will be removed in the next release.
+}
+
+size_t AbstractAscendMemoryPoolSupport::free_mem_size() { return AscendMemAdapter::GetInstance()->FreeDevMemSize(); }
+
+uint64_t AbstractAscendMemoryPoolSupport::total_mem_size() const {
+  return AscendMemAdapter::GetInstance()->MaxHbmSizeForMs();
+}
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5183ef4f5cffa870f3c9d27a62e657611f44816
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ABSTRACT_ASCEND_ASCEND_MEMORY_POOL_SUPPORT_H_
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ABSTRACT_ASCEND_ASCEND_MEMORY_POOL_SUPPORT_H_
+
+#include <memory>
+
+#include "hardware/hardware_abstract/memory/dynamic_mem_pool.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+// Definition for the abstract Ascend memory pool support class; wraps the Ascend device interface.
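+// Dispatch order used by the eager-free entry points in the .cc above (sketch):
+//   IsEnableVmm()            -> AscendVmmAdapter route
+//   else IsEnableEagerFree() -> AscendGmemAdapter route
+//   else                     -> error: both mechanisms disabled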
+class HARDWARE_EXPORT AbstractAscendMemoryPoolSupport : virtual public DynamicMemPool { + public: + ~AbstractAscendMemoryPoolSupport() override = default; + + size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override; + + bool FreeDeviceMem(const DeviceMemPtr &addr) override; + + size_t MmapDeviceMem(const size_t size, const DeviceMemPtr addr) override; + + size_t GetMaxUsedMemSize() const override; + + size_t GetVmmUsedMemSize() const override; + + size_t free_mem_size() override; + + uint64_t total_mem_size() const override; + + // Set mem pool block size + void SetMemPoolBlockSize(size_t available_device_mem_size) override; + + virtual void ResetIdleMemBuf() const; + + // Calculate memory block required alloc size when adding the memory block. + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle) override; + + // The related interface of device memory eager free. + const bool IsEnableEagerFree() const override; + + const bool SyncAllStreams() override; + + size_t AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) override; + + size_t FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) override; + + size_t EmptyCache() override; +}; +using AbstractAscendMemoryPoolSupportPtr = std::shared_ptr; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // #define MINDSPORE_CCSRC_RUNTIME_DEVICE_ABSTRACT_ASCEND_ASCEND_MEMORY_POOL_SUPPORT_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..497763187ade5f3fd192a29531a573715034d4a2 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.cc @@ -0,0 +1,127 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.h"
+#include <memory>
+#include <sstream>
+#include "common/common.h"
+
+#include "hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h"
+#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h"
+#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+constexpr size_t kMBToByte = 1024 << 10;
+
+uint8_t *AscendDynamicMemAdapter::MallocStaticDevMem(size_t size, const std::string &tag) {
+  std::lock_guard<std::mutex> locker(mutex_);
+  if (has_alloc_size + size > LongToSize(max_available_ms_hbm_size_)) {
+    LOG_ERROR << "Not enough memory to allocate, has_alloc_size:" << has_alloc_size << ", size:" << size
+              << ", max_available_ms_moc_size:" << max_available_ms_hbm_size_;
+  }
+  auto addr = MallocFromRts(size);
+  if (addr != nullptr) {
+    has_alloc_size += size;
+    (void)static_memory_blocks_.emplace(addr, std::make_shared<MemoryBlock>(addr, size, tag));
+    LOG_OUT << "MallocStaticDevMem success, size:" << size << ", tag:" << tag;
+  }
+  return addr;
+}
+
+bool AscendDynamicMemAdapter::FreeStaticDevMem(void *addr) {
+  LOG_OUT << "FreeStaticDevMem addr:" << addr << ".";
+  std::lock_guard<std::mutex> locker(mutex_);
+  if (addr == nullptr) {
+    LOG_ERROR << "addr is nullptr.";
+    return false;
+  }
+  auto &&iter = static_memory_blocks_.find(addr);
+  if (iter == static_memory_blocks_.end()) {
+    LOG_ERROR << "addr is not in static memory blocks, addr:" << addr << ".";
+    return false;
+  }
+  auto mem_block = iter->second;
+  auto ret = FreeToRts(mem_block->mem_ptr, mem_block->mem_size);
+  if (!ret) {
+    LOG_ERROR << "Free memory failed.";
+    return false;
+  }
+  LOG_OUT << "Free memory success, addr:" << addr << ", size:" << mem_block->mem_size << ".";
+  has_alloc_size -= mem_block->mem_size;
+  static_memory_blocks_.erase(addr);
+  return true;
+}
+
+bool AscendDynamicMemAdapter::Initialize() {
+  if (initialized_) {
+    return true;
+  }
+  (void)AscendMemAdapter::Initialize();
+  initialized_ = true;
+  LOG_OUT << "Ascend Memory Adapter initialize success, Memory Statistics:" << DevMemStatistics();
+  return true;
+}
+
+bool AscendDynamicMemAdapter::DeInitialize() {
+  for (const auto &[addr, blk] : static_memory_blocks_) {
+    if (blk->mem_ptr != nullptr) {
+      auto ret = FreeToRts(blk->mem_ptr, blk->mem_size);
+      if (!ret) {
+        LOG_ERROR << "Free memory failed.";
+        return false;
+      }
+      LOG_OUT << "Free memory success, addr:" << addr << ", size:" << blk->mem_size << ", tag:" << blk->mem_tag;
+    }
+  }
+  (void)AscendMemAdapter::DeInitialize();
+  has_alloc_size = 0;
+  static_memory_blocks_.clear();
+  initialized_ = false;
+  return true;
+}
+
+uint64_t AscendDynamicMemAdapter::FreeDevMemSize() const { return max_available_ms_hbm_size_ - has_alloc_size; }
+
+uint8_t *AscendDynamicMemAdapter::MallocDynamicDevMem(size_t size, const std::string &) {
+  LOG_ERROR << "MallocDynamicDevMem is disabled.";
+  return nullptr;
+}
+
+void AscendDynamicMemAdapter::ResetDynamicMemory() { LOG_ERROR << "ResetDynamicMemory is disabled."; }
+
+std::string AscendDynamicMemAdapter::DevMemStatistics() const {
+  std::ostringstream oss;
+  oss << "\nDevice MOC memory size: " << device_hbm_total_size_ / kMBToByte << "M";
+  oss << "\nMindSpore Used memory size: " << ms_used_hbm_size_ / kMBToByte << "M";
+  auto print_actual_peak_memory = AscendVmmAdapter::GetInstance().IsEnabled()
+                                    ?
AscendVmmAdapter::GetInstance().GetAllocatedSize() + : actual_peak_memory_; + oss << "\nUsed peak memory usage (without fragments): " << used_peak_memory_ / kMBToByte << "M"; + oss << "\nActual peak memory usage (with fragments): " << print_actual_peak_memory / kMBToByte << "M"; + oss << std::endl; + return oss.str(); +} + +size_t AscendDynamicMemAdapter::GetDynamicMemUpperBound(void *min_static_addr) const { + LOG_ERROR << "GetDynamicMemUpperBound is disabled."; + return 0; +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..7f7eeef077735bde4f9bedd1b6b5dca54143bbf6 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.h @@ -0,0 +1,47 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_DYNAMIC_MEM_ADAPTER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_DYNAMIC_MEM_ADAPTER_H_ + +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h" +#include +#include +#include + +namespace mindspore { +namespace device { +namespace ascend { +class AscendDynamicMemAdapter : public AscendMemAdapter { + public: + bool Initialize() override; + bool DeInitialize() override; + uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = "") override; + bool FreeStaticDevMem(void *addr) override; + uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = "") override; + void ResetDynamicMemory() override; + std::string DevMemStatistics() const override; + size_t GetDynamicMemUpperBound(void *min_static_addr) const override; + [[nodiscard]] uint64_t FreeDevMemSize() const override; + + private: + size_t has_alloc_size = 0; + std::map> static_memory_blocks_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_DYNAMIC_MEM_ADAPTER_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..5a7dfab491d94ceaba9e1ccbd61813aab2fbdd08 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.cc @@ -0,0 +1,127 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h" +#include +#include +#include +#include +#include + +#include +#include "common/common.h" +#include "hardware/ascend/res_manager/ascend_stream_manager.h" + +namespace mindspore { +namespace device { +namespace ascend { +static constexpr const char kGMemLibName[] = "libgmem.so"; +static constexpr const char kMsEnableGmem[] = "MS_ENABLE_GMEM"; +constexpr uint64_t kAscendMmapAlignSize = 1 << 21; +constexpr int kMapPeerShared = 0x8000000; + +const size_t AscendGmemAdapter::GetRoundUpAlignSize(size_t input_size) const { + return (input_size + kAscendMmapAlignSize - 1) & ~(kAscendMmapAlignSize - 1); +} + +const size_t AscendGmemAdapter::GetRoundDownAlignSize(size_t input_size) const { + return input_size & ~(kAscendMmapAlignSize - 1); +} + +size_t AscendGmemAdapter::AllocDeviceMem(size_t size, DeviceMemPtr *addr) const { + size_t align_size = GetRoundUpAlignSize(size); + uint8_t *alloc_addr = MmapMemory(align_size, nullptr); + if (alloc_addr == nullptr) { + LOG_OUT << "Malloc memory failed."; + return 0; + } + *addr = alloc_addr; + return align_size; +} + +size_t AscendGmemAdapter::EagerFreeDeviceMem(const DeviceMemPtr addr, const size_t size) const { + CHECK_IF_NULL(addr); + LOG_OUT << "Enter ascend eager free device mem, addr : " << addr << ", size : " << size << "."; + if (size == 0) { + LOG_OUT << "Eager free device mem, addr : " << addr << ", size is zero."; + return 0; + } + size_t addr_size_t = reinterpret_cast(addr); + // Adjust addr -> round up addr, size -> round down size. + size_t from_addr = GetRoundUpAlignSize(addr_size_t); + size_t end_addr = GetRoundDownAlignSize(addr_size_t + size); + if (end_addr <= from_addr) { + LOG_OUT << "End addr : " << end_addr << " is not bigger than from_addr : " << from_addr << "."; + return 0; + } + size_t real_size = end_addr - from_addr; + int ret = free_eager_(from_addr, SizeToUlong(real_size), nullptr); + return ret != 0 ? 
0 : real_size;
+}
+
+uint8_t *AscendGmemAdapter::MmapMemory(size_t size, void *addr) const {
+  LOG_OUT << "Enter mmap memory, size : " << size << ".";
+  if (size == 0) {
+    LOG_ERROR << "Mmap memory, addr : " << addr << ", size is zero.";
+    return nullptr;
+  }
+
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS | kMapPeerShared;
+  int prot = PROT_READ | PROT_WRITE;
+  void *mapped_addr = mmap(addr, size, prot, flags, -1, 0);
+  if (mapped_addr == MAP_FAILED) {
+    LOG_ERROR << "Mmap failed.";
+  }
+  return static_cast<uint8_t *>(mapped_addr);
+}
+
+bool AscendGmemAdapter::MunmapMemory(void *addr, const size_t size) const {
+  LOG_OUT << "Enter munmap memory, addr : " << addr << ", size : " << size << ".";
+  auto ret = munmap(addr, size);
+  return ret != -1;
+}
+
+void AscendGmemAdapter::LoadGMemLib() noexcept {
+  LOG_OUT << "MS_ENABLE_GMEM is set, try to open gmem.";
+  gmem_handle_ = dlopen(kGMemLibName, RTLD_NOW);
+  if (gmem_handle_ != nullptr) {
+    LOG_OUT << "Open GMem lib success, mindspore will use gmem to optimize memory usage.";
+    LIB_FUNC(GMEM_FREE_EAGER) gmem_free_eager = DlsymFuncObj(gmemFreeEager, gmem_handle_);
+    if (gmem_free_eager != nullptr) {
+      is_eager_free_enabled_ = true;
+      free_eager_ = gmem_free_eager;
+    } else {
+      LOG_OUT << "Load gmem free eager failed.";
+      if (dlclose(gmem_handle_) != 0) {
+        LOG_ERROR << "Close GMem lib failed, detail : " << dlerror() << ".";
+      }
+    }
+  } else {
+    LOG_OUT << "Open GMem lib failed.";
+  }
+}
+
+void AscendGmemAdapter::UnloadGMemLib() noexcept {
+  if (gmem_handle_ != nullptr) {
+    LOG_OUT << "Close GMem lib.";
+    if (dlclose(gmem_handle_) != 0) {
+      LOG_ERROR << "Close GMem lib failed, detail : " << dlerror() << ".";
+    }
+    gmem_handle_ = nullptr;
+  }
+}
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3edb8bd61ade23694fab1434fd44ece3342fa96a
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h
@@ -0,0 +1,76 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_GMEM_ADAPTER_H_
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_GMEM_ADAPTER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "acl/acl.h"
+#include "hardware/hardware_abstract/dlopen_macro.h"
+
+namespace mindspore {
+namespace device {
+namespace ascend {
+#define CONCAT(l, r) l##r
+// Function object definition macro.
+#define LIB_FUNC(func_name) CONCAT(func_name, FunObj)
+// Function definition macro; afterwards `LIB_FUNC(func_name)` can be used.
+#define DEFINE_LIB_METHOD(func_name, ...) ORIGIN_METHOD(func_name, __VA_ARGS__)
+
+// GMem mem free eager function name. The original name is needed when exporting the symbol from the lib.
+#define GMEM_FREE_EAGER gmemFreeEager
+// Definition for the GMem lib function : GMEM_FREE_EAGER.
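+// Assuming ORIGIN_METHOD follows the usual dlopen-macro pattern from
+// dlopen_macro.h, the next line declares a callable type named
+// gmemFreeEagerFunObj with signature size_t(uint64_t, size_t, void *);
+// free_eager_ stores an instance of it once DlsymFuncObj resolves the symbol.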
+DEFINE_LIB_METHOD(GMEM_FREE_EAGER, size_t, uint64_t, size_t, void *); + +using DeviceMemPtr = void(*); +class AscendGmemAdapter { + public: + static AscendGmemAdapter &GetInstance() { + static AscendGmemAdapter instance{}; + return instance; + } + + AscendGmemAdapter() { LoadGMemLib(); } + ~AscendGmemAdapter() { UnloadGMemLib(); } + + public: + const size_t GetRoundUpAlignSize(size_t input_size) const; + const size_t GetRoundDownAlignSize(size_t input_size) const; + + size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) const; + size_t EagerFreeDeviceMem(const DeviceMemPtr addr, const size_t size) const; + + uint8_t *MmapMemory(size_t size, void *addr) const; + bool MunmapMemory(void *addr, const size_t size) const; + + inline const bool is_eager_free_enabled() const { return is_eager_free_enabled_; } + + private: + void LoadGMemLib() noexcept; + void UnloadGMemLib() noexcept; + + bool is_eager_free_enabled_{false}; + void *gmem_handle_{nullptr}; + // Function for eager free. + LIB_FUNC(GMEM_FREE_EAGER) free_eager_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..a848e12bdc6790721699d588f242e3e946d6e584 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.cc @@ -0,0 +1,295 @@ +/** + * Copyright 2021-2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_dynamic_mem_adapter.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_gmem_adapter.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h" +#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h" +#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h" +#include "common/common.h" + +namespace mindspore { +namespace device { +namespace ascend { +namespace { +constexpr size_t kMBToByte = 1024 << 10; +constexpr size_t kGBToByte = 1024 << 20; +constexpr uint64_t kAscendMemAlignSize = 512; +constexpr double kHalfRatio = 0.5; +constexpr double kMSMemoryRatio = 0.9375; // 15/16 +constexpr double kReservedMemoryRatio = 0.0625; // 1/16 +constexpr size_t kPerHugePageMemorySize = 2097152; // 2mb +constexpr size_t kExtraReservedMemory = 10485760; // 10mb +constexpr size_t kSimuHBMTotalMemSizeGB = 64; +} // namespace +AscendMemAdapterPtr AscendMemAdapter::instance_ = nullptr; + +AscendMemAdapterPtr AscendMemAdapter::GetInstance() { + if (instance_ == nullptr) { + instance_ = std::make_shared(); + } + return instance_; +} + +size_t AscendMemAdapter::GetRoundDownAlignSize(size_t input_size) { + return (input_size / kAscendMemAlignSize) * kAscendMemAlignSize; +} + +size_t AscendMemAdapter::GetRoundUpAlignSize(size_t input_size) { + return ((input_size + kAscendMemAlignSize - 1) / kAscendMemAlignSize) * kAscendMemAlignSize; +} + +size_t AscendMemAdapter::GetDeviceMemSizeFromContext() const { + size_t size_from_context; + float total_device_memory = 32.0f; + auto max_device_memory = total_device_memory; + // if (context->ascend_soc_version() == kAscendVersion910b || context->ascend_soc_version() == kAscendVersion910_93) { + // total_device_memory = 64.0f; + // } + // if (context->ascend_soc_version() == kAscendVersion310p) { + // total_device_memory = 43.0f; + // } + LOG_OUT << "context max_device_memory:" << max_device_memory; + size_from_context = FloatToSize(max_device_memory * kGBToByte); + + return size_from_context; +} + +bool AscendMemAdapter::Initialize() { + if (initialized_) { + return true; + } + + // use 0 temporarily. + float huge_page_reserve_size = 0; + device_hbm_huge_page_reserved_size_ = static_cast(huge_page_reserve_size * kGBToByte); + if (AscendVmmAdapter::IsEnabled() && device_hbm_huge_page_reserved_size_ > 0) { + LOG_OUT << "Reserve huge page feature is not available when VMM is enabled."; + } + LOG_OUT << "Config huge_page_reserve_size : " << huge_page_reserve_size + << ", device_hbm_huge_page_reserved_size_ : " << device_hbm_huge_page_reserved_size_; + + auto ret = CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &device_hbm_free_size_, &device_hbm_total_size_); + if (ret != ACL_SUCCESS || device_hbm_total_size_ == 0) { + LOG_ERROR << "Internal Error: Get Device MOC memory size failed, ret = " << ret + << ", total MOC size :" << device_hbm_total_size_; + } + + if (device_hbm_free_size_ < LongToSize(DoubleToLong(device_hbm_total_size_ * kHalfRatio))) { + // use 0 temporarily. + unsigned int device_id = 0; + LOG_OUT << "Free memory size is less " + "than half of total memory size." 
+ << "Device " << device_id << " Device MOC total size:" << device_hbm_total_size_ + << " Device MOC free size:" << device_hbm_free_size_ + << " may be other processes occupying this card, check as: ps -ef|grep python"; + } + + // get user define max backend memory + auto user_define_ms_size = GetDeviceMemSizeFromContext(); + auto recommend_mem_size_for_others = LongToSize(DoubleToLong(device_hbm_free_size_ * kReservedMemoryRatio)); + size_t reserved_mem_size_for_others; + if (user_define_ms_size == 0) { + ms_used_hbm_size_ = DoubleToLong(device_hbm_free_size_ * kMSMemoryRatio); + // sub the extra reserved 10mb after rounding down the 2mb + ms_used_hbm_size_ = (ms_used_hbm_size_ / kPerHugePageMemorySize) * kPerHugePageMemorySize - kExtraReservedMemory; + reserved_mem_size_for_others = device_hbm_free_size_ - SizeToLong(ms_used_hbm_size_); + } else { + if (user_define_ms_size >= device_hbm_free_size_) { + LOG_ERROR << "#umsg#Framework Error Message:#umsg#The Free Device Memory Size is " + << (SizeToFloat(device_hbm_free_size_) / kGBToByte) << " GB, max_device_memory should be in range (0-" + << (SizeToFloat(device_hbm_free_size_) / kMBToByte) << "]MB, but got " + << (SizeToFloat(user_define_ms_size) / kMBToByte) + << "MB, please set the context key max_device_memory in valid range."; + } + ms_used_hbm_size_ = SizeToLong(user_define_ms_size); + + reserved_mem_size_for_others = device_hbm_total_size_ - LongToSize(ms_used_hbm_size_); + if (reserved_mem_size_for_others < recommend_mem_size_for_others) { + LOG_OUT << "Reserved memory size for other components(" << reserved_mem_size_for_others + << ") is less than recommend size(" << recommend_mem_size_for_others + << "), It may lead to Out Of Memory in HCCL or other components, Please double check context key " + "'variable_memory_max_size'/'max_device_memory'"; + } + } + + if (AscendVmmAdapter::GetInstance().IsEnabled()) { + ms_used_hbm_size_ = SizeToLong(AscendVmmAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_)); + } else if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) { + ms_used_hbm_size_ = SizeToLong(AscendGmemAdapter::GetInstance().GetRoundDownAlignSize(ms_used_hbm_size_)); + } else { + ms_used_hbm_size_ = SizeToLong(GetRoundDownAlignSize(ms_used_hbm_size_)); + } + max_available_ms_hbm_size_ = ms_used_hbm_size_; + + auto get_init_info = [this, &reserved_mem_size_for_others, &recommend_mem_size_for_others, + &user_define_ms_size]() -> std::string { + std::ostringstream oss; + oss << "Device MOC Size:" << device_hbm_total_size_ / kMBToByte + << "M, Device free MOC Size:" << device_hbm_free_size_ / kMBToByte + << "M, Reserved MOC size for Other Components(HCCL/rts/etc.):" << reserved_mem_size_for_others / kMBToByte + << "M, Recommend Reserved MOC size for Other Components:" << recommend_mem_size_for_others / kMBToByte + << "M, User define MindSpore MOC Size:" << user_define_ms_size / kGBToByte + << "G, MindSpore Used MOC Size:" << ms_used_hbm_size_ / kMBToByte << "M."; + return oss.str(); + }; + + LOG_OUT << get_init_info(); + initialized_ = true; + return true; +} + +void AscendMemAdapter::SimulationInitialize() { + device_hbm_total_size_ = kSimuHBMTotalMemSizeGB * kGBToByte; + device_hbm_free_size_ = device_hbm_total_size_; + size_t reserved_mem_size_for_others; + auto user_define_ms_size = GetDeviceMemSizeFromContext(); + if (user_define_ms_size == 0) { + ms_used_hbm_size_ = DoubleToLong(device_hbm_free_size_ * kMSMemoryRatio); + ms_used_hbm_size_ = (ms_used_hbm_size_ / kPerHugePageMemorySize) * 
kPerHugePageMemorySize - kExtraReservedMemory; + reserved_mem_size_for_others = device_hbm_free_size_ - SizeToLong(ms_used_hbm_size_); + } else { + ms_used_hbm_size_ = SizeToLong(user_define_ms_size); + if (user_define_ms_size > device_hbm_total_size_) { + device_hbm_total_size_ = user_define_ms_size; + } + reserved_mem_size_for_others = device_hbm_total_size_ - user_define_ms_size; + } + + LOG_OUT << "Simulation Device MOC Size:" << device_hbm_total_size_ / kMBToByte + << "M, Device free MOC Size:" << device_hbm_free_size_ / kMBToByte + << "M, Reserved MOC size for Other Components(HCCL/rts/etc.):" << reserved_mem_size_for_others / kMBToByte + << "M, User define MindSpore MOC Size:" << user_define_ms_size / kGBToByte + << "G, MindSpore Used MOC Size:" << ms_used_hbm_size_ / kMBToByte << "M."; + max_available_ms_hbm_size_ = ms_used_hbm_size_; + initialized_ = true; +} + +bool AscendMemAdapter::DeInitialize() { + if (!initialized_) { + LOG_OUT << "DeInitialize Ascend Memory Adapter when it is not initialize"; + return false; + } + std::ostringstream oss_buf; + oss_buf << "Ascend Memory Adapter deinitialize success, statistics:" << DevMemStatistics(); + LOG_OUT << oss_buf.str(); + device_hbm_total_size_ = 0; + device_hbm_free_size_ = 0; + ms_used_hbm_size_ = 0; + max_available_ms_hbm_size_ = 0; + initialized_ = false; + return true; +} + +namespace { +struct HugeMemReserver { + HugeMemReserver(size_t size, size_t reserver_size) { + LOG_OUT << "Allocate size : " << size << ", reserve_size : " << reserver_size << "."; + if (reserver_size < kMBToByte) { + return; + } + size_t free_size = 0; + size_t total_size = 0; + auto ret = CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM_HUGE, &free_size, &total_size); + LOG_OUT << "Huge mem reserve free_size : " << free_size << ", total_size : " << total_size << "."; + if (ret == ACL_SUCCESS) { + if (free_size < reserver_size + size) { + LOG_OUT << "Free size of huge page mem[" << free_size + << "] is less than the sum of reserver_size and allocate size. 
Reserve size " << reserver_size + << ", allocate size : " << size << ", total ACL_HBM_MEM_HUGE size : " << total_size << "."; + if (free_size < reserver_size) { + LOG_ERROR << "Free size of huge page mem[" << free_size << "] is less than reserver_size : " << reserver_size + << ", change reserve operation with free size."; + reserver_size = free_size; + } + ret = CALL_ASCEND_API(aclrtMalloc, reinterpret_cast(&addr_), reserver_size, ACL_MEM_MALLOC_HUGE_ONLY); + if (ret != ACL_RT_SUCCESS) { + addr_ = nullptr; + LOG_ERROR << "aclrtMalloc mem size[" << reserver_size << "] fail, ret[" << ret << "]"; + } else { + LOG_OUT << "Huge mem reserve success, addr : " << addr_ << ", size : " << reserver_size << "."; + } + } + } else { + LOG_OUT << "aclrtGetMemInfo mem size[" << size << "] fail, ret[" << ret << "]"; + } + } + + ~HugeMemReserver() { + if (addr_ != nullptr) { + auto ret = CALL_ASCEND_API(aclrtFree, addr_); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "aclrtFree mem [" << addr_ << "] fail, ret[" << ret << "]"; + } else { + LOG_OUT << "Huge mem reserve success, free : " << addr_ << "."; + } + } + } + + void *addr_{nullptr}; +}; +} // namespace + +uint8_t *AscendMemAdapter::MallocFromRts(size_t size) const { + uint8_t *ptr = nullptr; + if (AscendVmmAdapter::GetInstance().IsEnabled()) { + return nullptr; + } + if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) { + return AscendGmemAdapter::GetInstance().MmapMemory(size, reinterpret_cast(ptr)); + } + + HugeMemReserver huge_mem_reserver(size, device_hbm_huge_page_reserved_size_); + auto ret = CALL_ASCEND_API(aclrtMalloc, reinterpret_cast(&ptr), size, ACL_MEM_TYPE_HIGH_BAND_WIDTH); + if (ret != ACL_RT_SUCCESS) { + if (ret == ACL_ERROR_RT_MEMORY_ALLOCATION) { + // use 0 temporarily. + unsigned int device_id = 0; + size_t free_size = 0; + size_t total = 0; + (void)CALL_ASCEND_API(aclrtGetMemInfo, ACL_HBM_MEM, &free_size, &total); + LOG_ERROR << "#umsg#Framework Error Message:#umsg#Malloc device memory failed, size[" << size << "], ret[" << ret + << "], " + << "Device " << device_id << " Available MOC size:" << total << " free size:" << free_size + << " may be other processes occupying this card, check as: ps -ef|grep python"; + } else { + LOG_ERROR << "rtMalloc mem size[" << size << "] fail, ret[" << ret << "]"; + } + } else { + LOG_OUT << "Call rtMalloc to allocate device memory Success, size: " << size + << " bytes, address start: " << reinterpret_cast(ptr) + << " end: " << reinterpret_cast(ptr + size); + } + return ptr; +} + +bool AscendMemAdapter::FreeToRts(void *devPtr, const size_t size) const { + if (devPtr != nullptr) { + if (AscendGmemAdapter::GetInstance().is_eager_free_enabled()) { + return AscendGmemAdapter::GetInstance().MunmapMemory(devPtr, size); + } + auto ret = CALL_ASCEND_API(aclrtFree, devPtr); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "aclrtFree mem [" << devPtr << "] fail, ret[" << ret << "]"; + return false; + } + } + return true; +} +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..507798b1b693ad46bdd832208a4000d419d57eac --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h @@ -0,0 +1,106 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you 
may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_ADAPTER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_ADAPTER_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/common.h" +#include "hardware/hardware_abstract/visible.h" + +namespace mindspore { +namespace device { +namespace ascend { +struct MemoryBlock { + MemoryBlock(void *ptr, const size_t size, const std::string &tag) { + mem_ptr = ptr; + mem_size = size; + mem_tag = tag; + } + + void *mem_ptr{nullptr}; + size_t mem_size{0}; + std::string mem_tag; +}; + +class AscendMemAdapter; +using AscendMemAdapterPtr = std::shared_ptr; + +class HARDWARE_EXPORT AscendMemAdapter { + public: + virtual ~AscendMemAdapter() = default; + static AscendMemAdapterPtr GetInstance(); + + virtual bool Initialize(); + virtual bool DeInitialize(); + + virtual uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = "") = 0; + virtual bool FreeStaticDevMem(void *addr) = 0; + virtual uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = "") = 0; + virtual void ResetDynamicMemory() = 0; + virtual std::string DevMemStatistics() const = 0; + virtual size_t GetDynamicMemUpperBound(void *min_static_addr) const = 0; + [[nodiscard]] virtual uint64_t FreeDevMemSize() const = 0; + + virtual void SimulationInitialize(); + + int64_t GetActualPeakMemory() const { return actual_peak_memory_; } + int64_t GetUsedPeakMemory() const { return used_peak_memory_; } + void UpdateActualPeakMemory(int64_t memory) { actual_peak_memory_ = std::max(actual_peak_memory_, memory); } + void UpdateUsedPeakMemory(int64_t memory) { used_peak_memory_ = std::max(used_peak_memory_, memory); } + [[nodiscard]] uint64_t MaxHbmSizeForMs() const { return max_available_ms_hbm_size_; } + [[nodiscard]] int64_t GetMsUsedHbmSize() const { return ms_used_hbm_size_; } + static size_t GetRoundUpAlignSize(size_t input_size); + static size_t GetRoundDownAlignSize(size_t input_size); + + protected: + AscendMemAdapter() = default; + uint8_t *MallocFromRts(size_t size) const; + bool FreeToRts(void *devPtr, const size_t size) const; + + bool initialized_{false}; + // Support multi-thread. 
+ std::mutex mutex_; + + // Actual peak memory usage (with fragments) + int64_t actual_peak_memory_{0}; + // Used peak memory usage (without fragments) + int64_t used_peak_memory_{0}; + + // rts Memory INFO + size_t device_hbm_total_size_{0}; + size_t device_hbm_free_size_{0}; + size_t device_hbm_huge_page_reserved_size_{0}; + + int64_t ms_used_hbm_size_{0}; + int64_t max_available_ms_hbm_size_{0}; + + private: + DISABLE_COPY_AND_ASSIGN(AscendMemAdapter) + size_t GetDeviceMemSizeFromContext() const; + static AscendMemAdapterPtr instance_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_ADAPTER_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..18f41585266f45b65b1b2e881a33e1a692eb1e77 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.cc @@ -0,0 +1,191 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_manager.h" + +#include +#include +#include +#include +#include + +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_adapter.h" +#include "hardware/ascend/res_manager/ascend_stream_manager.h" +#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h" +#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h" +#include "common/common.h" + +namespace mindspore { +namespace device { +namespace ascend { +void AscendMemoryManager::Initialize() { (void)AscendMemAdapter::GetInstance()->Initialize(); } + +void AscendMemoryManager::Finalize() { + AscendMemoryPool::GetInstance().ReleaseDeviceRes(); + (void)AscendMemAdapter::GetInstance()->DeInitialize(); +} + +void AscendMemoryManager::ResetDynamicMemory() { AscendMemAdapter::GetInstance()->ResetDynamicMemory(); } + +void AscendMemoryManager::ClearGlobalIdleMem() { AscendMemoryPool::GetInstance().ResetIdleMemBuf(); } + +uint64_t AscendMemoryManager::GetMsMaxMemSize() const { return AscendMemAdapter::GetInstance()->MaxHbmSizeForMs(); } + +uint64_t AscendMemoryManager::GetMsUsedHbmSize() const { return AscendMemAdapter::GetInstance()->GetMsUsedHbmSize(); } + +void *AscendMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle, + uint32_t stream_id) { + auto align_size = GetCommonAlignSize(size); + return AscendMemoryPool::GetInstance().AllocTensorMem(align_size, from_persistent_mem, need_recycle, stream_id); +} + +void AscendMemoryManager::FreeMemFromMemPool(void *device_ptr) { + AscendMemoryPool::GetInstance().FreeTensorMem(device_ptr); +} + +size_t AscendMemoryManager::GetMaxUsedMemorySize() const { return AscendMemoryPool::GetInstance().GetMaxUsedMemSize(); } + +// Relevant function to manage memory statistics +size_t 
AscendMemoryManager::GetTotalMemStatistics() const { + return AscendMemoryPool::GetInstance().TotalMemStatistics(); +} +size_t AscendMemoryManager::GetTotalUsedMemStatistics() const { + return AscendMemoryPool::GetInstance().TotalUsedMemStatistics(); +} +size_t AscendMemoryManager::GetTotalIdleMemStatistics() const { + return AscendMemoryPool::GetInstance().TotalIdleMemStatistics(); +} +size_t AscendMemoryManager::GetTotalEagerFreeMemStatistics() const { + return AscendMemoryPool::GetInstance().TotalEagerFreeMemStatistics(); +} +size_t AscendMemoryManager::GetUsedMemPeakStatistics() const { + return AscendMemoryPool::GetInstance().MaxMemAllocatedStatistics(); +} +size_t AscendMemoryManager::GetReservedMemPeakStatistics() const { + return AscendMemoryPool::GetInstance().MaxMemReservedStatistics(); +} +std::unordered_map AscendMemoryManager::GetBlockCountsStatistics() const { + return AscendMemoryPool::GetInstance().BlockCountsStatistics(); +} +std::unordered_map AscendMemoryManager::GetBlockUnitSizeStatistics() const { + return AscendMemoryPool::GetInstance().BlockUnitSizeStatistics(); +} +std::unordered_map> +AscendMemoryManager::GetCommonMemBlocksInfoStatistics() const { + return AscendMemoryPool::GetInstance().CommonMemBlocksInfoStatistics(); +} +std::unordered_map> +AscendMemoryManager::GetPersistentMemBlocksInfoStatistics() const { + return AscendMemoryPool::GetInstance().PersistentMemBlocksInfoStatistics(); +} +void AscendMemoryManager::ResetMaxMemoryReserved() { AscendMemoryPool::GetInstance().ResetMaxMemReserved(); } +void AscendMemoryManager::ResetMaxMemoryAllocated() { AscendMemoryPool::GetInstance().ResetMaxMemAllocated(); } +size_t AscendMemoryManager::EmptyCache() { return AscendMemoryPool::GetInstance().EmptyCache(); } + +uint8_t *AscendMemoryManager::MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + LOG_OUT << "Malloc Memory for Static: size[" << align_size << "] communication_mem:" << communication_mem; + + uint8_t *alloc_address = reinterpret_cast(AscendMemoryPool::GetInstance().AllocTensorMem(align_size)); + if (alloc_address != nullptr) { + // create protect area [kMemAlignSize -- data -- kMemAlignSize] for communication node memory + return communication_mem ? alloc_address + kMemAlignSize : alloc_address; + } + LOG_ERROR << "#umsg#Framework Error Message:#umsg#Fail to alloc memory, size: " << align_size + << "B, memory statistics:" << AscendMemAdapter::GetInstance()->DevMemStatistics(); + return 0; +} + +uint8_t *AscendMemoryManager::MallocDynamicMem(size_t size, bool communication_mem) { + size_t align_size = 0; + if (communication_mem) { + align_size = GetCommunicationAlignSize(size); + } else { + align_size = GetCommonAlignSize(size); + } + LOG_OUT << "Malloc Memory for Dynamic: size[" << align_size << "] communication_mem: " << communication_mem; + + uint8_t *alloc_address = + reinterpret_cast(AscendMemAdapter::GetInstance()->MallocDynamicDevMem(align_size)); + CHECK_IF_NULL(alloc_address); + // create protect area [kMemAlignSize -- data -- kMemAlignSize] for communication node memory + return communication_mem ? 
alloc_address + kMemAlignSize : alloc_address;
+}
+
+size_t AscendMemoryManager::GetAvailableMemSize() {
+  auto available_mem_size = AscendMemoryPool::GetInstance().free_mem_size() +
+                            AscendMemoryPool::GetInstance().TotalMemStatistics() -
+                            AscendMemoryPool::GetInstance().TotalUsedMemStatistics();
+  return available_mem_size;
+}
+
+DynamicMemPool *AscendMemoryManager::GetMemoryPool() {
+  if (MS_UNLIKELY(memory_pool_ == nullptr)) {
+    memory_pool_ = &(AscendMemoryPool::GetInstance());
+  }
+  return memory_pool_;
+}
+
+void EnhancedAscendMemoryManager::Initialize() {
+  AscendMemoryManager::Initialize();
+  LOG_OUT << "EnhancedAscendMemoryManager initialize.";
+  alloc_costs_.clear();
+}
+
+void EnhancedAscendMemoryManager::Finalize() {
+  AscendMemoryManager::Finalize();
+  LOG_OUT << "EnhancedAscendMemoryManager finalize.";
+  std::sort(alloc_costs_.begin(), alloc_costs_.end());
+  // Calculate mean and median, then print them.
+  auto total_size = alloc_costs_.size();
+  if (total_size == 0) {
+    LOG_OUT << "No memory operation.";
+    return;
+  }
+  // The costs are sorted: an odd count takes the middle element, an even count
+  // averages the two middle elements.
+  double median = 0;
+  if (total_size & 1) {
+    median = alloc_costs_[total_size >> 1];
+  } else {
+    median = (alloc_costs_[(total_size >> 1) - 1] + alloc_costs_[total_size >> 1]) / 2.0;
+  }
+  LOG_OUT << "EnhancedAscendMemoryManager median : " << median << "ns.";
+
+  double sum = std::accumulate(alloc_costs_.begin(), alloc_costs_.end(), 0.0);
+  double mean = sum / total_size;
+  LOG_OUT << "EnhancedAscendMemoryManager mean : " << mean << "ns.";
+
+  const double cost_high_water = 1800;
+  if (median > cost_high_water || mean > cost_high_water) {
+    LOG_OUT << "EnhancedAscendMemoryManager check failed, median : " << median << ", mean : " << mean;
+  }
+}
+
+void *EnhancedAscendMemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle,
+                                                        uint32_t stream_id) {
+  auto start_tick = GetCurrentTick();
+  auto ret = AscendMemoryManager::MallocMemFromMemPool(size, from_persistent_mem, need_recycle, stream_id);
+  auto cost = GetCurrentTick() - start_tick;
+  (void)alloc_costs_.emplace_back(cost);
+  LOG_OUT << "Malloc memory cost : " << cost << "ns.";
+  return ret;
+}
+} // namespace ascend
+} // namespace device
+} // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..38cdba324b422f88e6031cb822dc35212a9f64f2
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_manager.h
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ + +#include +#include +#include + +#include +#include "hardware/hardware_abstract/memory_manager.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_pool.h" + +namespace mindspore { +namespace device { +namespace ascend { +class HARDWARE_EXPORT AscendMemoryManager : public MemoryManager { + public: + AscendMemoryManager() = default; + ~AscendMemoryManager() override = default; + + void Initialize() override; + void Finalize() override; + void ResetDynamicMemory() override; + void ClearGlobalIdleMem() override; + void *MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle = false, + uint32_t stream_id = kDefaultStreamIndex) override; + void FreeMemFromMemPool(void *device_ptr) override; + size_t GetMaxUsedMemorySize() const override; + uint64_t GetMsMaxMemSize() const; + std::vector MallocContinuousMemFromMemPool(const std::vector &size_list, + uint32_t stream_id = kDefaultStreamIndex) override { + return AscendMemoryPool::GetInstance().AllocContinuousTensorMem(size_list, stream_id); + } + + size_t GetAvailableMemSize() override; + uint64_t GetMsUsedHbmSize() const; + + // Relevant function to manage memory statistics + size_t GetTotalMemStatistics() const override; + size_t GetTotalUsedMemStatistics() const override; + size_t GetTotalIdleMemStatistics() const override; + size_t GetTotalEagerFreeMemStatistics() const override; + size_t GetUsedMemPeakStatistics() const override; + size_t GetReservedMemPeakStatistics() const override; + std::unordered_map GetBlockCountsStatistics() const override; + std::unordered_map GetBlockUnitSizeStatistics() const override; + std::unordered_map> GetCommonMemBlocksInfoStatistics() + const override; + std::unordered_map> + GetPersistentMemBlocksInfoStatistics() const override; + void ResetMaxMemoryReserved() override; + void ResetMaxMemoryAllocated() override; + size_t EmptyCache() override; + + DynamicMemPool *GetMemoryPool() override; + + protected: + uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) override; + uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override; +}; + +class HARDWARE_EXPORT EnhancedAscendMemoryManager : public AscendMemoryManager { + public: + EnhancedAscendMemoryManager() = default; + ~EnhancedAscendMemoryManager() override = default; + + void Initialize() override; + + void Finalize() override; + + void *MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle, uint32_t stream_id) override; + + private: + inline uint64_t GetCurrentTick() { + auto &&ts = std::chrono::system_clock::now(); + return static_cast(std::chrono::duration_cast(ts.time_since_epoch()).count()); + } + + std::vector alloc_costs_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..c47013b143988d02907d5692668de98b93d634c9 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.cc @@ -0,0 +1,331 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hardware/ascend/res_manager/mem_manager/ascend_memory_pool.h" + +#include +#include +#include + +#include +#include + +#include "common/common.h" +#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h" +#include "hardware/ascend/res_manager/ascend_stream_manager.h" + +namespace mindspore { +namespace device { +namespace ascend { +constexpr size_t kByteOffset = 8; +constexpr uint32_t kDefaultDispatchThreadsNum = 5; +constexpr uint32_t kDefaultOpThreadsNum = 25; +constexpr float kDefaultMemInitSize = 2.0; +constexpr float kDefaultMemBlockIncreaseSize = 1.0; +constexpr float kDefaultMemMaxSize = 1024.0; + +DefaultAscendMemoryPool::DefaultAscendMemoryPool() { + LOG_OUT << "DefaultAscendMemoryPool constructed."; + SetEnableVmm(AscendVmmAdapter::GetInstance().IsEnabled()); +} + +size_t DefaultAscendMemoryPool::EmptyCache() { + LockGuard lock(AbstractDynamicMemPool::lock()); + AbstractEnhancedDynamicMemPool::WaitPipelineHelper(); + AbstractAscendMemoryPoolSupport::SyncAllStreams(); + size_t release_free_size = 0; + if (MS_UNLIKELY(!customized_allocators_.empty())) { + release_free_size += ReleaseCustomFreeBlocks(); + } + if (IsEnableVmm()) { + AbstractEnhancedDynamicMemPool::FreeIdleMemsByEagerFree(); + release_free_size += AbstractAscendMemoryPoolSupport::EmptyCache(); + return release_free_size; + } else if (IsEnableEagerFree()) { + auto ret = AbstractEnhancedDynamicMemPool::FreeIdleMemsByEagerFree(); + LOG_OUT << "Eager free memory size is " << ret.second << "."; + release_free_size += ret.second; + return release_free_size; + } + + LOG_OUT << "Vmm is not enabled, try to release free blocks."; + // // disable ge kernel use two pointer mem adapter, not support free. 
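+  // (The disabled branch below is kept for reference: per the note above, the
+  // two-pointer mem adapter used when GE kernels are disabled cannot release
+  // free blocks, so EmptyCache would have to bail out with 0 in that mode.)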
+ // if (IsDisableGeKernel()) { + // return 0L; + // } + release_free_size += ReleaseFreeBlocks(); + return release_free_size; +} + +void DefaultAscendMemoryPool::EnablePluggableAllocator(std::function alloc_fn, + std::function free_fn) { + custom_alloc_fn_ = alloc_fn; + custom_free_fn_ = free_fn; + enable_custom_allocator_ = true; +} + +void DefaultAscendMemoryPool::DisablePluggableAllocator() { + enable_custom_allocator_ = false; + return; +} + +DefaultEnhancedAscendMemoryPool::DefaultEnhancedAscendMemoryPool(const DefaultAscendMemoryPoolPtr &instance) + : instance_(instance) { + LOG_OUT << "DefaultEnhancedAscendMemoryPool constructed."; + instance_->SetEnableVmm(AscendVmmAdapter::GetInstance().IsEnabled()); +} + +void DefaultEnhancedAscendMemoryPool::ReleaseDeviceRes() { + LOG_OUT << "Start release device res."; + instance_->ReleaseDeviceRes(); +} + +DeviceMemPtr DefaultEnhancedAscendMemoryPool::AllocTensorMem(size_t size, bool from_persistent_mem, bool need_recycle, + uint32_t stream_id) { + size_t align_size = AlignMemorySize(size); + LOG_OUT << "Allocate tensor mem, size : " << size << ", align_size : " << align_size + << ", need_recycle : " << need_recycle << "."; + LockGuard lock(instance_->lock()); + const auto [mem_buf, allocator] = instance_->AllocMemBuf(align_size, from_persistent_mem, stream_id); + if (mem_buf == nullptr) { + LOG_OUT << "Allocate tensor mem, return nullptr."; + // Dump mem pool state info and debug info when alloc tensor failed. + DumpDynamicMemPoolStateInfo(); + DumpDynamicMemPoolDebugInfo(); + return nullptr; + } + + mem_buf->SetDebugInfo(); + instance_->addr_mem_buf_allocators().emplace(mem_buf->addr_, std::make_pair(mem_buf, allocator)); + auto device_addr = mem_buf->addr_; + + instance_->ReportMemoryPoolInfo(); + instance_->ReportMemoryPoolMallocInfoToMstx(device_addr, align_size); + + LOG_OUT << "Allocate tensor mem, return : " << mem_buf->ToJson() << ", stat info : " << instance_->mem_stat().ToJson() + << "."; + return device_addr; +} + +std::vector DefaultEnhancedAscendMemoryPool::AllocContinuousTensorMem( + const std::vector &size_list, uint32_t stream_id) { + LOG_OUT << "Alloc continuous tensor mem, stream id : " << stream_id << "."; + const auto &continuous_addrs = instance_->AllocContinuousTensorMem(size_list, stream_id); + if (continuous_addrs.size() != size_list.size()) { + return continuous_addrs; + } + if (continuous_addrs.size() == 1 && continuous_addrs[0] == nullptr) { + return continuous_addrs; + } + return continuous_addrs; +} + +void DefaultEnhancedAscendMemoryPool::FreeTensorMem(const DeviceMemPtr &device_addr) { + LOG_OUT << "Free tensor mem, device addr : " << device_addr << "."; + LockGuard lock(instance_->lock()); + DoFreeTensorMem(device_addr); +} + +bool DefaultEnhancedAscendMemoryPool::DoFreeTensorMem(const DeviceMemPtr &device_addr) { + void *enhanced_device_addr = device_addr; + bool ret = instance_->DoFreeTensorMem(device_addr); + LOG_OUT << "Do free tensor mem : " << enhanced_device_addr << ", return : " << ret << "."; + return ret; +} + +void DefaultEnhancedAscendMemoryPool::FreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) { + LOG_OUT << "Free part tensor mems."; + LockGuard lock(instance_->lock()); + + const auto keep_mem_bufs = instance_->DoFreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes); +} + +void DefaultEnhancedAscendMemoryPool::DefragMemory() { + if (last_vmm_used_size_ == 0) { + last_vmm_used_size_ = GetVmmUsedMemSize(); + } else { + 
size_t vmm_used_size = GetVmmUsedMemSize(); + if (vmm_used_size > last_vmm_used_size_) { + LOG_OUT << "Current vmm used size : " << vmm_used_size + << " is bigger than last vmm used size : " << last_vmm_used_size_ << "."; + last_vmm_used_size_ = vmm_used_size; + } + } + + instance_->DefragMemory(); +} + +void DefaultEnhancedAscendMemoryPool::DumpDynamicMemPoolStateInfo() { instance_->DumpDynamicMemPoolStateInfo(); } + +const std::pair DefaultEnhancedAscendMemoryPool::FreeIdleMemsByEagerFree() { + const auto [eager_free_size, real_free_size] = instance_->FreeIdleMemsByEagerFree(); + return {eager_free_size, real_free_size}; +} + +bool DefaultEnhancedAscendMemoryPool::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + uint32_t memory_stream_id) { + LockGuard lock(instance_->lock()); + auto key = std::make_pair(user_stream_id, memory_stream_id); + auto iter = instance_->stream_pair_mem_bufs().find(key); + if (iter == instance_->stream_pair_mem_bufs().end()) { + return false; + } + + auto mem_bufs_ = iter->second; + for (const auto &mem_buf : mem_bufs_) { + LOG_OUT << "Wait event for : " << mem_buf->ToJson() << "."; + mem_buf->WaitEvent(task_id_on_stream, user_stream_id); + // Remove event and try to free memory. + if (mem_buf->IsEventNotUsed()) { + instance_->mem_stat().used_by_event_size_ -= mem_buf->size_; + // Force clear all mem bufs. + for (auto &stream_pair_mem_bufs : instance_->stream_pair_mem_bufs()) { + (void)stream_pair_mem_bufs.second.erase(mem_buf); + } + if (mem_buf->status_ == DynamicMemBufStatus::kMemBufUsedByEvent) { + (void)DoFreeTensorMem(mem_buf->addr_); + } + } + } + return true; +} + +bool DefaultEnhancedAscendMemoryPool::WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) { + LockGuard lock(instance_->lock()); + for (auto &stream_pair_mem_bufs : instance_->stream_pair_mem_bufs()) { + const auto &[user_stream, memory_stream] = stream_pair_mem_bufs.first; + if (memory_stream != memory_stream_id) { + continue; + } + auto mem_bufs = stream_pair_mem_bufs.second; + for (const auto &mem_buf : mem_bufs) { + LOG_OUT << "Wait event for : " << mem_buf->ToJson() << "."; + mem_buf->WaitEvent(task_id_on_stream, user_stream); + // Remove event and try to free memory. + if (mem_buf->IsEventNotUsed()) { + instance_->mem_stat().used_by_event_size_ -= mem_buf->size_; + // Force clear all mem bufs. 
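+        // A mem_buf can be registered under several (user_stream, memory_stream)
+        // pairs, so it must be erased from every set before it can be freed.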
+        for (auto &kv : instance_->stream_pair_mem_bufs()) {
+          (void)kv.second.erase(mem_buf);
+        }
+        if (mem_buf->status_ == DynamicMemBufStatus::kMemBufUsedByEvent) {
+          (void)DoFreeTensorMem(mem_buf->addr_);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool DefaultEnhancedAscendMemoryPool::SyncAllEvents() {
+  LockGuard lock(instance_->lock());
+  if (instance_->stream_pair_mem_bufs().empty()) {
+    return false;
+  }
+
+  std::set carry_event_mem_bufs;
+  for (const auto &stream_pair_mem_buf : instance_->stream_pair_mem_bufs()) {
+    for (const auto &mem_buf : stream_pair_mem_buf.second) {
+      (void)carry_event_mem_bufs.emplace(mem_buf);
+    }
+  }
+  for (auto &mem_buf : carry_event_mem_bufs) {
+    if (mem_buf->SyncAllEvents() && mem_buf->status_ == DynamicMemBufStatus::kMemBufUsedByEvent) {
+      (void)DoFreeTensorMem(mem_buf->addr_);
+    }
+  }
+
+  instance_->stream_pair_mem_bufs().clear();
+  return true;
+}
+
+void DefaultEnhancedAscendMemoryPool::SetRankIdGetter(const std::function &rank_id_getter) {
+  instance_->SetRankIdGetter(rank_id_getter);
+  if (rank_id_getter != nullptr) {
+    rank_id_getter_ = rank_id_getter;
+  }
+}
+
+BestFitAscendMemoryPool::BestFitAscendMemoryPool() {
+  LOG_OUT << "BestFitAscendMemoryPool constructed, older memory allocator is enabled.";
+  SetEnableVmm(AscendVmmAdapter::GetInstance().IsEnabled());
+}
+
+size_t BestFitAscendMemoryPool::EmptyCache() {
+  LOG_OUT << "Best fit memory pool does not support empty cache.";
+  return 0L;
+}
+
+// Initialize static members in AscendMemoryPool.
+AbstractAscendMemoryPoolSupportPtr AscendMemoryPool::pool_ = nullptr;
+
+AbstractAscendMemoryPoolSupportPtr AscendMemoryPool::instance_ = nullptr;
+
+AbstractAscendMemoryPoolSupportPtr AscendMemoryPool::enhanced_instance_ = nullptr;
+
+AbstractAscendMemoryPoolSupport &AscendMemoryPool::GetInstance() {
+  static std::once_flag flag;
+  std::call_once(flag, [&]() {
+    if (UseOldMemoryPool()) {
+      instance_ = std::make_shared();
+      enhanced_instance_ = instance_;
+    } else {
+      const auto &memory_pool = std::make_shared();
+      instance_ = memory_pool;
+      enhanced_instance_ = std::make_shared(memory_pool);
+    }
+    // Initialize instance and set ptr.
+    float init_size = kDefaultMemInitSize;
+    size_t init_size_byte = FloatToSize(init_size * kGBToByte);
+    float increase_size = kDefaultMemBlockIncreaseSize;
+    size_t increase_size_byte = FloatToSize(increase_size * kGBToByte);
+    float max_size = kDefaultMemMaxSize;
+    size_t max_size_byte = FloatToSize(max_size * kGBToByte);
+    instance_->Initialize(init_size_byte, increase_size_byte, max_size_byte);
+    // Select the pool exposed to callers.
+    if (!UseEnhancedMemoryPool()) {
+      pool_ = instance_;
+    } else {
+      pool_ = enhanced_instance_;
+    }
+  });
+  return *pool_;
+}
+
+void AscendMemoryPool::SetEnhancedMemoryPool(bool enable) {
+  LOG_OUT << "Set enhanced memory pool : " << enable << ".";
+  if (enable) {
+    pool_ = enhanced_instance_;
+  } else {
+    pool_ = instance_;
+  }
+}
+
+bool AscendMemoryPool::UseOldMemoryPool() {
+  return false;
+  // if (memory::mem_pool::IsDisableAllocConfig(memory::mem_pool::kAllocMemoryPool)) {
+  //   return false;
+  // }
+  // return IsDisableGeKernel() || memory::mem_pool::IsEnableAllocConfig(memory::mem_pool::kAllocMemoryPool);
+}
+
+// Use enhanced memory pool when enable debug, enable log, enable prof, dry run and so on.
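+// Hard-coded to false for now, so the plain pool is selected at first use;
+// callers can still switch to the enhanced pool at runtime through
+// AscendMemoryPool::SetEnhancedMemoryPool(true).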
+bool AscendMemoryPool::UseEnhancedMemoryPool() { return false; } +} // namespace ascend +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..cbe3b44500e9e54d26faa1bb983dbc03720c1b0c --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_memory_pool.h @@ -0,0 +1,300 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ + +#include +#include +#include +#include +#include + +#include "hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.h" +#include "hardware/hardware_abstract/visible.h" +#include "hardware/ascend/res_manager/mem_manager/abstract_ascend_memory_pool_support.h" + +namespace mindspore { +namespace device { +namespace ascend { + +class HARDWARE_EXPORT DefaultAscendMemoryPool : public AbstractAscendMemoryPoolSupport, + public AbstractEnhancedDynamicMemPool { + public: + DefaultAscendMemoryPool(); + DefaultAscendMemoryPool(const DefaultAscendMemoryPool &) = delete; + DefaultAscendMemoryPool &operator=(const DefaultAscendMemoryPool &) = delete; + ~DefaultAscendMemoryPool() override = default; + + std::string GetMemoryPoolType() const override { return "DefaultAscendMemoryPool"; } + + void SetMemPoolBlockSize(size_t available_device_mem_size) override { + return AbstractAscendMemoryPoolSupport::SetMemPoolBlockSize(available_device_mem_size); + } + + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false) override { + return AbstractAscendMemoryPoolSupport::CalMemBlockAllocSize(size, from_persistent_mem, need_recycle); + } + + const bool IsEnableEagerFree() const override { return AbstractAscendMemoryPoolSupport::IsEnableEagerFree(); } + + size_t EmptyCache() override; + + void EnablePluggableAllocator(std::function alloc_fn, std::function free_fn) override; + void DisablePluggableAllocator() override; +}; +using DefaultAscendMemoryPoolPtr = std::shared_ptr; + +class HARDWARE_EXPORT DefaultEnhancedAscendMemoryPool : public DefaultAscendMemoryPool { + public: + explicit DefaultEnhancedAscendMemoryPool(const DefaultAscendMemoryPoolPtr &instance); + DefaultEnhancedAscendMemoryPool(const DefaultEnhancedAscendMemoryPool &) = delete; + DefaultEnhancedAscendMemoryPool &operator=(const DefaultEnhancedAscendMemoryPool &) = delete; + ~DefaultEnhancedAscendMemoryPool() override = default; + + // Wrap enhanced function. 
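+  // DefaultEnhancedAscendMemoryPool is a logging/diagnostics decorator: each
+  // override below either adds bookkeeping (locking, state dumps, mstx
+  // reporting) around the wrapped call or delegates directly to instance_.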
+ void Initialize(size_t init_size, size_t increase_size, size_t max_size) override { + instance_->Initialize(init_size, increase_size, max_size); + } + + void ReleaseDeviceRes() override; + + DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false, + uint32_t stream_id = kDefaultStreamIndex) override; + + std::vector AllocContinuousTensorMem(const std::vector &size_list, + uint32_t stream_id = kDefaultStreamIndex) override; + + void FreeTensorMem(const DeviceMemPtr &device_addr) override; + + bool DoFreeTensorMem(const DeviceMemPtr &device_addr) override; + + void FreePartTensorMems(const std::vector &free_addrs, const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) override; + + std::vector DoFreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) override { + return instance_->DoFreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes); + } + + void DefragMemory() override; + + void DumpDynamicMemPoolStateInfo() override; + + const std::pair FreeIdleMemsByEagerFree() override; + + size_t ReleaseFreeBlocks() override { return instance_->ReleaseFreeBlocks(); } + + // Proxy wrapper for AbstractAscendMemoryPoolSupport + void ResetIdleMemBuf() const override { instance_->ResetIdleMemBuf(); } + + bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + const std::vector> &memory_stream_addresses, + const DeviceEventPtr &event) override { + return instance_->RecordEvent(task_id_on_stream, user_stream_id, memory_stream_addresses, event); + } + + bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) override; + + bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) override; + + bool SyncAllEvents() override; + + void EnablePluggableAllocator(std::function alloc_fn, std::function free_fn) override { + return instance_->EnablePluggableAllocator(alloc_fn, free_fn); + } + + void DisablePluggableAllocator() override { return instance_->DisablePluggableAllocator(); } + + size_t AlignMemorySize(size_t size) const override { return instance_->AlignMemorySize(size); } + + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false) override { + return instance_->CalMemBlockAllocSize(size, from_persistent_mem, need_recycle); + } + + void SetMemPoolBlockSize(size_t available_device_mem_size) override { + instance_->SetMemPoolBlockSize(available_device_mem_size); + } + + size_t MemAllocUnitSize(bool from_persistent_mem) const override { + return instance_->MemAllocUnitSize(from_persistent_mem); + } + + void SetMemAllocUintSize(size_t common_size, size_t persist_size = kDynamicMemAllocUnitSize) override { + instance_->SetMemAllocUintSize(common_size, persist_size); + } + + void *GetMinUsingMemoryAddr() const override { return instance_->GetMinUsingMemoryAddr(); } + + size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override { return instance_->AllocDeviceMem(size, addr); } + + bool FreeDeviceMem(const DeviceMemPtr &addr) override { return instance_->FreeDeviceMem(addr); } + + size_t free_mem_size() override { return instance_->free_mem_size(); } + + uint64_t total_mem_size() const override { return instance_->total_mem_size(); } + + size_t GetMaxUsedMemSize() const override { return instance_->GetMaxUsedMemSize(); } + + size_t GetVmmUsedMemSize() const override { return instance_->GetVmmUsedMemSize(); } + + void DumpDynamicMemPoolDebugInfo() override { 
instance_->DumpDynamicMemPoolDebugInfo(); } + + size_t TotalMemStatistics() const override { return instance_->TotalMemStatistics(); } + + size_t TotalUsedMemStatistics() const override { return instance_->TotalUsedMemStatistics(); } + + size_t TotalUsedByEventMemStatistics() const override { return instance_->TotalUsedByEventMemStatistics(); } + + size_t TotalIdleMemStatistics() const override { return instance_->TotalIdleMemStatistics(); } + + size_t TotalEagerFreeMemStatistics() const override { return instance_->TotalEagerFreeMemStatistics(); } + + size_t UsedMemPeakStatistics() const override { return instance_->UsedMemPeakStatistics(); } + + size_t MaxMemAllocatedStatistics() const override { return instance_->MaxMemAllocatedStatistics(); } + + size_t MaxMemReservedStatistics() const override { return instance_->MaxMemReservedStatistics(); } + + size_t ActualPeakStatistics() const override { return instance_->ActualPeakStatistics(); } + + std::unordered_map BlockCountsStatistics() const override { + return std::move(instance_->BlockCountsStatistics()); + } + + std::unordered_map BlockUnitSizeStatistics() const override { + return std::move(instance_->BlockUnitSizeStatistics()); + } + + std::unordered_map> CommonMemBlocksInfoStatistics() + const override { + return std::move(instance_->CommonMemBlocksInfoStatistics()); + } + + std::unordered_map> PersistentMemBlocksInfoStatistics() + const override { + return std::move(instance_->PersistentMemBlocksInfoStatistics()); + } + + void ResetMaxMemReserved() override { instance_->ResetMaxMemReserved(); } + + void ResetMaxMemAllocated() override { instance_->ResetMaxMemAllocated(); } + + const bool IsEnableEagerFree() const override { return instance_->IsEnableEagerFree(); } + + const bool IsEnableVmm() const override { return instance_->IsEnableVmm(); } + + void SetEnableVmm(bool enable_vmm) override { instance_->SetEnableVmm(enable_vmm); } + + const bool SyncAllStreams() override { return instance_->SyncAllStreams(); } + + size_t AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) override { + return instance_->AllocDeviceMemByEagerFree(size, addr); + } + + size_t FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) override { + return instance_->FreeDeviceMemByEagerFree(addr, size); + } + + size_t MmapDeviceMem(size_t size, DeviceMemPtr addr) override { return instance_->MmapDeviceMem(size, addr); } + + std::string GetMemoryPoolType() const override { return "DefaultEnhancedAscendMemoryPool"; } + + void ReportMemoryPoolInfo() override { instance_->ReportMemoryPoolInfo(); } + + void ReportMemoryPoolMallocInfoToMstx(void *ptr, size_t size) override { + instance_->ReportMemoryPoolMallocInfoToMstx(ptr, size); + } + + void ReportMemoryPoolFreeInfoToMstx(void *ptr) override { instance_->ReportMemoryPoolFreeInfoToMstx(ptr); } + + bool IsEnableTimeEvent() override { return instance_->IsEnableTimeEvent(); } + + void SetEnableTimeEvent(bool enable_time_event) override { instance_->SetEnableTimeEvent(enable_time_event); } + + MemoryTimeEventPtr GenAllocateMemoryTimeEvent(const void *addr, size_t size, uint32_t stream_id, bool from_persistent, + bool is_persistent) override { + return instance_->GenAllocateMemoryTimeEvent(addr, size, stream_id, from_persistent, is_persistent); + } + + MemoryTimeEventPtr GenFreeMemoryTimeEvent(const void *addr) override { + return instance_->GenFreeMemoryTimeEvent(addr); + } + + size_t EmptyCache() override { return instance_->EmptyCache(); } + + protected: + void SetRankIdGetter(const 
std::function &rank_id_getter) override; + + private: + DefaultAscendMemoryPoolPtr instance_; + size_t last_vmm_used_size_{0}; +}; + +class HARDWARE_EXPORT BestFitAscendMemoryPool : public AbstractAscendMemoryPoolSupport { + public: + BestFitAscendMemoryPool(); + BestFitAscendMemoryPool(const BestFitAscendMemoryPool &) = delete; + BestFitAscendMemoryPool &operator=(const BestFitAscendMemoryPool &) = delete; + ~BestFitAscendMemoryPool() override = default; + + void SetMemPoolBlockSize(size_t available_device_mem_size) override { + return AbstractAscendMemoryPoolSupport::SetMemPoolBlockSize(available_device_mem_size); + } + + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false) override { + return AbstractAscendMemoryPoolSupport::CalMemBlockAllocSize(size, from_persistent_mem, need_recycle); + } + + const bool IsEnableEagerFree() const override { return AbstractAscendMemoryPoolSupport::IsEnableEagerFree(); } + + std::string GetMemoryPoolType() const override { return "BestFitAscendMemoryPool"; } + + size_t EmptyCache() override; +}; + +class HARDWARE_EXPORT AscendMemoryPool { + public: + AscendMemoryPool(const AscendMemoryPool &) = delete; + AscendMemoryPool &operator=(const AscendMemoryPool &) = delete; + + static AbstractAscendMemoryPoolSupport &GetInstance(); + + static void SetEnhancedMemoryPool(bool enable); + + private: + AscendMemoryPool() {} + + static bool UseOldMemoryPool(); + + // Use enhanced memory pool when enable debug, enable log, enable prof, dry run and so on. + static bool UseEnhancedMemoryPool(); + + // Reference to memory pool. + static AbstractAscendMemoryPoolSupportPtr pool_; + + // Basic memory pool instance with high performance. + static AbstractAscendMemoryPoolSupportPtr instance_; + + // Memory pool support profiling and debugging. + static AbstractAscendMemoryPoolSupportPtr enhanced_instance_; +}; +} // namespace ascend +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_MEMORY_POOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.cc b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..104429256b574af256d4a608b69a279ab4217d0e --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.cc @@ -0,0 +1,272 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h" +#include +#include +#include + +#include "hardware/ascend/res_manager/symbol_interface/symbol_utils.h" +#include "hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h" +#include "common/common.h" + +namespace mindspore { +namespace device { +namespace ascend { +size_t AscendVmmAdapter::GetRoundUpAlignSize(size_t input_size) const { + return ((input_size + vmm_align_size_ - 1) / vmm_align_size_) * vmm_align_size_; +} + +size_t AscendVmmAdapter::GetRoundDownAlignSize(size_t input_size) const { + return (input_size / vmm_align_size_) * vmm_align_size_; +} + +size_t AscendVmmAdapter::GetHandleSize(size_t input_size) { + if (input_size % vmm_align_size_ != 0) { + LOG_ERROR << "Input size must be multiple of 2MB, but got " << input_size; + } + return input_size / vmm_align_size_; +} + +DeviceMemPtr AscendVmmAdapter::FindVmmSegment(const DeviceMemPtr addr) { + auto it = vmm_map_.upper_bound(addr); + if (it == vmm_map_.begin()) { + return nullptr; + } else { + --it; + return it->first; + } + return nullptr; +} + +void AscendVmmAdapter::ClearAllMemory() { + for (auto &kv : vmm_map_) { + if (kv.second == nullptr) { + continue; + } + auto ret = CALL_ASCEND_API(aclrtUnmapMem, kv.first); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Unmap memory failed."; + } + ret = CALL_ASCEND_API(aclrtFreePhysical, kv.second); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Free physical memory failed."; + } + } + while (!cached_handle_sets_.empty()) { + auto handle = *cached_handle_sets_.begin(); + cached_handle_sets_.erase(cached_handle_sets_.begin()); + auto ret = CALL_ASCEND_API(aclrtFreePhysical, handle); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Free physical memory failed."; + } + } + for (auto &addr : all_reserve_mems_) { + CALL_ASCEND_API(aclrtReleaseMemAddress, addr); + } + all_reserve_mems_.clear(); + vmm_map_.clear(); +} + +namespace { +void MoveBackMappedHandle(std::map *mapped_vmm_handle, + std::map *vmm_map, + std::set *cached_handle_sets_) { + for (const auto [address, handle] : *mapped_vmm_handle) { + auto ret = CALL_ASCEND_API(aclrtUnmapMem, address); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Unmap memory failed, address : " << address << "."; + } else { + auto iter = vmm_map->find(address); + if (iter == vmm_map->end()) { + LOG_ERROR << "Find vmm map address : " << address << " failed."; + } else { + iter->second = nullptr; + cached_handle_sets_->insert(handle); + } + } + } +} +}; // namespace + +size_t AscendVmmAdapter::MmapDeviceMem(const size_t size, const DeviceMemPtr addr, const size_t max_size) { + CHECK_IF_NULL(addr); + LOG_OUT << "VMM MmapDeviceMem size:" << size << ", addr:" << addr + << ", cached_handle_sets_ size : " << cached_handle_sets_.size() << "."; + // use 0 temporarily + auto device_id = 0; + + auto vmm_start_addr = FindVmmSegment(addr); + if (vmm_start_addr == nullptr) { + LOG_ERROR << "Can not find the vmm segment."; + return 0; + } + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device_id; + prop.reserve = 0; + auto start_offset = CalAddressOffset(addr, vmm_start_addr); + auto align_size = GetRoundUpAlignSize(size + start_offset); + auto handle_size = GetHandleSize(align_size); + auto iter = vmm_map_.find(vmm_start_addr); + + std::map mapped_vmm_handle; + for (size_t i = 0; i < handle_size; ++i) { + 
auto new_addr = AddressOffset(vmm_start_addr, i * vmm_align_size_); + if (iter == vmm_map_.end() || iter->first != new_addr) { + LOG_ERROR << "Can not find the vmm segment."; + return 0; + } + if (iter->second != nullptr) { + iter++; + continue; + } + aclrtDrvMemHandle handle = nullptr; + if (!cached_handle_sets_.empty()) { + handle = *cached_handle_sets_.begin(); + cached_handle_sets_.erase(cached_handle_sets_.begin()); + } else { + if (physical_handle_size_ * vmm_align_size_ >= max_size) { + LOG_OUT << "Mapped too much memory, physical_handle_size_ : " << physical_handle_size_ + << ", max_size : " << max_size << ", addr : " << addr << ", size : " << size << "."; + MoveBackMappedHandle(&mapped_vmm_handle, &vmm_map_, &cached_handle_sets_); + return 0; + } + + auto ret = CALL_ASCEND_API(aclrtMallocPhysical, &handle, vmm_align_size_, &prop, 0); + if (ret != ACL_SUCCESS) { + size_t used_handle_size = 0; + for (const auto &[k, v] : vmm_map_) { + if (v != nullptr) { + LOG_OUT << "Inuse handle address : " << k << ", handle : " << v << "."; + used_handle_size += 1; + } + } + used_handle_size += cached_handle_sets_.size(); + // This may be a normal case at the memory usage boundary. + LOG_OUT << "Malloc physical memory failed, inuse physical memory handle size : " << used_handle_size + << ", physical_handle_size_ size : " << physical_handle_size_ << "."; + MoveBackMappedHandle(&mapped_vmm_handle, &vmm_map_, &cached_handle_sets_); + return 0; + } else { + physical_handle_size_++; + if (physical_handle_size_ * vmm_align_size_ >= max_size) { + LOG_OUT << "Mapped too much memory, physical_handle_size_ : " << physical_handle_size_ + << ", max_size : " << max_size << "."; + } + } + } + + auto ret = CALL_ASCEND_API(aclrtMapMem, new_addr, vmm_align_size_, 0, handle, 0); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Map memory failed."; + cached_handle_sets_.insert(handle); + MoveBackMappedHandle(&mapped_vmm_handle, &vmm_map_, &cached_handle_sets_); + return 0; + } + mapped_vmm_handle[iter->first] = handle; + iter->second = handle; + iter++; + } + + return size; +} + +size_t AscendVmmAdapter::AllocDeviceMem(size_t size, DeviceMemPtr *addr) { + CHECK_IF_NULL(addr); + size_t align_size = GetRoundUpAlignSize(size); + LOG_OUT << "VMM AllocDeviceMem size:" << size << ", align_size:" << align_size; + auto ret = CALL_ASCEND_API(aclrtReserveMemAddress, addr, align_size, 0, nullptr, 1); + if (ret != ACL_SUCCESS) { + LOG_ERROR << "Reserve memory address failed."; + return 0; + } + all_reserve_mems_.push_back(*addr); + auto handle_size = GetHandleSize(align_size); + for (size_t i = 0; i < handle_size; i++) { + auto new_addr = AddressOffset(*addr, i * vmm_align_size_); + vmm_map_[new_addr] = nullptr; + } + return align_size; +} + +size_t AscendVmmAdapter::EagerFreeDeviceMem(const DeviceMemPtr addr, const size_t size) { + CHECK_IF_NULL(addr); + LOG_OUT << "Eager free device mem addr :" << addr << ", size :" << size + << ", cached_handle_sets_ size : " << cached_handle_sets_.size() << "."; + size_t ret_size = 0; + auto iter = vmm_map_.lower_bound(addr); + if (iter == vmm_map_.end()) { + // Memory less than 2MB may be at the end of a vmm segment, and it's a normal case. 
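+    // e.g. with the default 2MB vmm_align_size_, eager-freeing a 1MB tail at
+    // the end of a segment lands here with lower_bound() == end() and is
+    // treated as a no-op rather than an error.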
+    if (size >= vmm_align_size_) {
+      LOG_ERROR << "Can not find the vmm segment.";
+    }
+    return 0;
+  }
+  auto vmm_start_addr = iter->first;
+  auto free_end_addr = AddressOffset(addr, size);
+  while (true) {
+    auto vmm_end_addr = AddressOffset(vmm_start_addr, vmm_align_size_);
+    if (vmm_end_addr > free_end_addr) {
+      break;
+    }
+    if (iter == vmm_map_.end() || iter->first != vmm_start_addr) {
+      LOG_ERROR << "Can not find the vmm segment.";
+      return 0;
+    }
+    if (iter->second == nullptr) {
+      iter++;
+      vmm_start_addr = vmm_end_addr;
+      // The run of already-unmapped (nullptr) entries may be long, so skip logging here.
+      continue;
+    }
+    auto ret = CALL_ASCEND_API(aclrtUnmapMem, vmm_start_addr);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Unmap memory failed.";
+      return 0;
+    }
+    cached_handle_sets_.insert(iter->second);
+    iter->second = nullptr;
+    iter++;
+    vmm_start_addr = vmm_end_addr;
+    ret_size += vmm_align_size_;
+  }
+  LOG_OUT << "After eager free, cached_handle_sets_ size : " << cached_handle_sets_.size()
+          << ", expected free size : " << size << ", real size : " << ret_size << ".";
+  return ret_size;
+}
+
+size_t AscendVmmAdapter::EmptyCache() {
+  size_t empty_size = 0L;
+  for (auto iter = cached_handle_sets_.begin(); iter != cached_handle_sets_.end(); iter++) {
+    auto ret = CALL_ASCEND_API(aclrtFreePhysical, *iter);
+    if (ret != ACL_SUCCESS) {
+      LOG_ERROR << "Free physical memory failed.";
+    }
+  }
+
+  size_t cache_handle_size = cached_handle_sets_.size();
+  physical_handle_size_ -= cache_handle_size;
+  empty_size += cache_handle_size * vmm_align_size_;
+  cached_handle_sets_.clear();
+  LOG_OUT << "Empty cache size: " << empty_size << ", freed cached handle count: " << cache_handle_size << ".";
+  return empty_size;
+}
+} // namespace ascend
+} // namespace device
+} // namespace mindspore
diff --git a/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a32f3818d08eb2fe92d8fcd4355f56a02968c33
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/mem_manager/ascend_vmm_adapter.h
@@ -0,0 +1,167 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_VMM_ADAPTER_H_ +#define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_VMM_ADAPTER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "hardware/hardware_abstract/dlopen_macro.h" +#include "common/common.h" + +#include "hardware/hardware_abstract/visible.h" + +namespace mindspore { +namespace device { +namespace ascend { +using DeviceMemPtr = void(*); +class HARDWARE_EXPORT AscendVmmAdapter { + public: + static AscendVmmAdapter &GetInstance() { + static AscendVmmAdapter instance{}; + return instance; + } + + AscendVmmAdapter() { + vmm_align_size_ = kDefaultAlignSize; + + LOG_OUT << "VMM align size is " << vmm_align_size_; + } + ~AscendVmmAdapter() = default; + + public: + size_t GetRoundUpAlignSize(size_t input_size) const; + size_t GetRoundDownAlignSize(size_t input_size) const; + + void ClearAllMemory(); + size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr); + size_t MmapDeviceMem(const size_t size, const DeviceMemPtr addr, const size_t max_size); + size_t EagerFreeDeviceMem(const DeviceMemPtr addr, const size_t size); + size_t GetAllocatedSize() { return physical_handle_size_ * vmm_align_size_; } + + size_t EmptyCache(); + + static const bool IsEnabled() { + static bool is_enable_vmm = IsVmmEnabled(); + return is_enable_vmm; + } + + private: + static const bool IsVmmEnabled() { + if (!CheckVmmDriverVersion()) { + return false; + } + + LOG_OUT << "VMM is enabled."; + return true; + } + + private: + uint64_t vmm_align_size_; + DeviceMemPtr FindVmmSegment(const DeviceMemPtr addr); + size_t GetHandleSize(size_t input_size); + std::atomic physical_handle_size_{0}; + std::map vmm_map_; + std::vector all_reserve_mems_; + std::set cached_handle_sets_; + static constexpr uint64_t kMB = 1024 * 1024; + static constexpr uint64_t kDefaultAlignSize = 2 * kMB; + static int StringToMB(const std::string &str) { + std::stringstream ss(str); + int num; + std::string unit; + if (!(ss >> num)) { + LOG_ERROR << "No valid number could be extracted from the string, " << str; + } + if (!(ss >> unit) || unit != "MB") { + LOG_ERROR << "The unit of the string is not MB, " << str; + } + if (ss.rdbuf()->in_avail() > 0) { + LOG_ERROR << "The string has extra characters, " << str; + } + return num; + } + static bool CheckVmmDriverVersion() { + // Get driver version + constexpr auto ascend_install_info = "/etc/ascend_install.info"; + const std::string DRIVER_INSTALL_PATH_PARAM = "Driver_Install_Path_Param="; + std::string driver_path = "/usr/local/Ascend"; + + std::ifstream ascend_install_file(ascend_install_info); + if (!ascend_install_file.is_open()) { + LOG_OUT << "Open file " << ascend_install_info << " failed."; + } else { + std::string line; + while (std::getline(ascend_install_file, line)) { + size_t pos = line.find(DRIVER_INSTALL_PATH_PARAM); + if (pos != std::string::npos) { + // Extract the path after "Driver_Install_Path_Param=" + driver_path = line.substr(pos + DRIVER_INSTALL_PATH_PARAM.length()); + LOG_OUT << "Driver path is " << driver_path; + break; + } + } + } + + auto splitString = [](const std::string &str, char delimiter) -> std::vector { + std::vector tokens; + std::string token; + std::istringstream tokenStream(str); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; + }; + + auto driver_version_info = driver_path + "/driver/version.info"; + const std::string DRIVER_VERSION_PARAM = "Version="; + std::ifstream 
driver_version_file(driver_version_info);
+    if (!driver_version_file.is_open()) {
+      LOG_OUT << "Open file " << driver_version_info << " failed.";
+    } else {
+      std::string line;
+      while (std::getline(driver_version_file, line)) {
+        size_t pos = line.find(DRIVER_VERSION_PARAM);
+        if (pos != std::string::npos) {
+          // Extract the version after "Version="
+          std::string driver_version = line.substr(pos + DRIVER_VERSION_PARAM.length());
+          auto split_version = splitString(driver_version, '.');
+          LOG_OUT << "Driver version is " << driver_version << ", major version is " << split_version[0];
+          if (split_version[0] < "24") {
+            LOG_OUT << "Driver version is less than 24.0.0, vmm is disabled by default, driver_version: "
+                    << driver_version;
+            return false;
+          }
+          break;
+        }
+      }
+    }
+    return true;
+  }
+};
+} // namespace ascend
+} // namespace device
+} // namespace mindspore
+
+#endif
diff --git a/inferrt/src/hardware/ascend/res_manager/CMakeLists.txt b/inferrt/src/hardware/ascend/res_manager/symbol_interface/CMakeLists.txt
similarity index 100%
rename from inferrt/src/hardware/ascend/res_manager/CMakeLists.txt
rename to inferrt/src/hardware/ascend/res_manager/symbol_interface/CMakeLists.txt
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1888495c4f855509ff9c0c8a5fa6a5cbc27c96d
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.cc
@@ -0,0 +1,85 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "acl_base_symbol.h" +#include +#include "symbol_utils.h" + +namespace mindspore::device::ascend { +aclCreateDataBufferFunObj aclCreateDataBuffer_ = nullptr; +aclCreateTensorDescFunObj aclCreateTensorDesc_ = nullptr; +aclDataTypeSizeFunObj aclDataTypeSize_ = nullptr; +aclDestroyDataBufferFunObj aclDestroyDataBuffer_ = nullptr; +aclDestroyTensorDescFunObj aclDestroyTensorDesc_ = nullptr; +aclGetTensorDescDimV2FunObj aclGetTensorDescDimV2_ = nullptr; +aclGetTensorDescNumDimsFunObj aclGetTensorDescNumDims_ = nullptr; +aclSetTensorConstFunObj aclSetTensorConst_ = nullptr; +aclSetTensorDescNameFunObj aclSetTensorDescName_ = nullptr; +aclSetTensorFormatFunObj aclSetTensorFormat_ = nullptr; +aclSetTensorPlaceMentFunObj aclSetTensorPlaceMent_ = nullptr; +aclSetTensorShapeFunObj aclSetTensorShape_ = nullptr; +aclrtGetSocNameFunObj aclrtGetSocName_ = nullptr; +aclUpdateDataBufferFunObj aclUpdateDataBuffer_ = nullptr; +aclGetDataBufferAddrFunObj aclGetDataBufferAddr_ = nullptr; +aclGetTensorDescSizeFunObj aclGetTensorDescSize_ = nullptr; +aclGetRecentErrMsgFunObj aclGetRecentErrMsg_ = nullptr; + +void LoadAclBaseApiSymbol(const std::string &ascend_path) { + std::string aclbase_plugin_path = "lib64/libascendcl.so"; + auto base_handler = GetLibHandler(ascend_path + aclbase_plugin_path); + if (base_handler == nullptr) { + LOG_OUT << "Dlopen " << aclbase_plugin_path << " failed!" << dlerror(); + return; + } + aclCreateDataBuffer_ = DlsymAscendFuncObj(aclCreateDataBuffer, base_handler); + aclCreateTensorDesc_ = DlsymAscendFuncObj(aclCreateTensorDesc, base_handler); + aclDataTypeSize_ = DlsymAscendFuncObj(aclDataTypeSize, base_handler); + aclDestroyDataBuffer_ = DlsymAscendFuncObj(aclDestroyDataBuffer, base_handler); + aclDestroyTensorDesc_ = DlsymAscendFuncObj(aclDestroyTensorDesc, base_handler); + aclGetTensorDescDimV2_ = DlsymAscendFuncObj(aclGetTensorDescDimV2, base_handler); + aclGetTensorDescNumDims_ = DlsymAscendFuncObj(aclGetTensorDescNumDims, base_handler); + aclSetTensorConst_ = DlsymAscendFuncObj(aclSetTensorConst, base_handler); + aclSetTensorDescName_ = DlsymAscendFuncObj(aclSetTensorDescName, base_handler); + aclSetTensorFormat_ = DlsymAscendFuncObj(aclSetTensorFormat, base_handler); + aclSetTensorPlaceMent_ = DlsymAscendFuncObj(aclSetTensorPlaceMent, base_handler); + aclSetTensorShape_ = DlsymAscendFuncObj(aclSetTensorShape, base_handler); + aclrtGetSocName_ = DlsymAscendFuncObj(aclrtGetSocName, base_handler); + aclUpdateDataBuffer_ = DlsymAscendFuncObj(aclUpdateDataBuffer, base_handler); + aclGetDataBufferAddr_ = DlsymAscendFuncObj(aclGetDataBufferAddr, base_handler); + aclGetTensorDescSize_ = DlsymAscendFuncObj(aclGetTensorDescSize, base_handler); + aclGetRecentErrMsg_ = DlsymAscendFuncObj(aclGetRecentErrMsg, base_handler); + LOG_OUT << "Load acl base api success!"; +} + +void LoadSimulationAclBaseApi() { + ASSIGN_SIMU(aclCreateDataBuffer); + ASSIGN_SIMU(aclCreateTensorDesc); + ASSIGN_SIMU(aclDataTypeSize); + ASSIGN_SIMU(aclDestroyDataBuffer); + ASSIGN_SIMU(aclDestroyTensorDesc); + ASSIGN_SIMU(aclGetTensorDescDimV2); + ASSIGN_SIMU(aclGetTensorDescNumDims); + ASSIGN_SIMU(aclSetTensorConst); + ASSIGN_SIMU(aclSetTensorDescName); + ASSIGN_SIMU(aclSetTensorFormat); + ASSIGN_SIMU(aclSetTensorPlaceMent); + ASSIGN_SIMU(aclSetTensorShape); + ASSIGN_SIMU(aclUpdateDataBuffer); + ASSIGN_SIMU(aclrtGetSocName); + ASSIGN_SIMU(aclGetDataBufferAddr); + ASSIGN_SIMU(aclGetTensorDescSize); + ASSIGN_SIMU(aclGetRecentErrMsg); +} +} // namespace mindspore::device::ascend diff --git 
+
+void LoadSimulationAclBaseApi() {
+  ASSIGN_SIMU(aclCreateDataBuffer);
+  ASSIGN_SIMU(aclCreateTensorDesc);
+  ASSIGN_SIMU(aclDataTypeSize);
+  ASSIGN_SIMU(aclDestroyDataBuffer);
+  ASSIGN_SIMU(aclDestroyTensorDesc);
+  ASSIGN_SIMU(aclGetTensorDescDimV2);
+  ASSIGN_SIMU(aclGetTensorDescNumDims);
+  ASSIGN_SIMU(aclSetTensorConst);
+  ASSIGN_SIMU(aclSetTensorDescName);
+  ASSIGN_SIMU(aclSetTensorFormat);
+  ASSIGN_SIMU(aclSetTensorPlaceMent);
+  ASSIGN_SIMU(aclSetTensorShape);
+  ASSIGN_SIMU(aclUpdateDataBuffer);
+  ASSIGN_SIMU(aclrtGetSocName);
+  ASSIGN_SIMU(aclGetDataBufferAddr);
+  ASSIGN_SIMU(aclGetTensorDescSize);
+  ASSIGN_SIMU(aclGetRecentErrMsg);
+}
+} // namespace mindspore::device::ascend
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e2d8fe3258954bf28570904c7d58f4b35014b45
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_base_symbol.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_BASE_SYMBOL_H_
+#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_BASE_SYMBOL_H_
+#include <string>
+#include "acl/acl_base.h"
+#include "hardware/hardware_abstract/dlopen_macro.h"
+
+namespace mindspore::device::ascend {
+ORIGIN_METHOD_WITH_SIMU(aclCreateDataBuffer, aclDataBuffer *, void *, size_t)
+ORIGIN_METHOD_WITH_SIMU(aclCreateTensorDesc, aclTensorDesc *, aclDataType, int, const int64_t *, aclFormat)
+ORIGIN_METHOD_WITH_SIMU(aclDataTypeSize, size_t, aclDataType)
+ORIGIN_METHOD_WITH_SIMU(aclDestroyDataBuffer, aclError, const aclDataBuffer *)
+ORIGIN_METHOD_WITH_SIMU(aclDestroyTensorDesc, void, const aclTensorDesc *)
+ORIGIN_METHOD_WITH_SIMU(aclGetTensorDescDimV2, aclError, const aclTensorDesc *, size_t, int64_t *)
+ORIGIN_METHOD_WITH_SIMU(aclGetTensorDescNumDims, size_t, const aclTensorDesc *)
+ORIGIN_METHOD_WITH_SIMU(aclSetTensorConst, aclError, aclTensorDesc *, void *, size_t)
+ORIGIN_METHOD_WITH_SIMU(aclSetTensorDescName, void, aclTensorDesc *, const char *)
+ORIGIN_METHOD_WITH_SIMU(aclSetTensorFormat, aclError, aclTensorDesc *, aclFormat)
+ORIGIN_METHOD_WITH_SIMU(aclSetTensorPlaceMent, aclError, aclTensorDesc *, aclMemType)
+ORIGIN_METHOD_WITH_SIMU(aclSetTensorShape, aclError, aclTensorDesc *, int, const int64_t *)
+ACLRT_GET_SOC_NAME_WITH_SIMU(aclrtGetSocName, const char *)
+ORIGIN_METHOD_WITH_SIMU(aclUpdateDataBuffer, aclError, aclDataBuffer *, void *, size_t)
+ORIGIN_METHOD_WITH_SIMU(aclGetDataBufferAddr, void *, const aclDataBuffer *)
+ORIGIN_METHOD_WITH_SIMU(aclGetTensorDescSize, size_t, const aclTensorDesc *)
+ORIGIN_METHOD_WITH_SIMU(aclGetRecentErrMsg, const char *)
+
+void LoadAclBaseApiSymbol(const std::string &ascend_path);
+void LoadSimulationAclBaseApi();
+} // namespace mindspore::device::ascend
+
+#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_BASE_SYMBOL_H_
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3d36395a4c1936a16e4b9f1a4c3340a5f832849
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "acl_compiler_symbol.h"
+#include <string>
+#include "symbol_utils.h"
+
+namespace mindspore::device::ascend {
+aclopCompileAndExecuteFunObj aclopCompileAndExecute_ = nullptr;
+aclopCompileAndExecuteV2FunObj aclopCompileAndExecuteV2_ = nullptr;
+aclSetCompileoptFunObj aclSetCompileopt_ = nullptr;
+aclopSetCompileFlagFunObj aclopSetCompileFlag_ = nullptr;
+aclGenGraphAndDumpForOpFunObj aclGenGraphAndDumpForOp_ = nullptr;
+
+void LoadAclOpCompilerApiSymbol(const std::string &ascend_path) {
+  std::string compiler_plugin_path = ascend_path + "lib64/libacl_op_compiler.so";
+  auto handler = GetLibHandler(compiler_plugin_path);
+  if (handler == nullptr) {
+    LOG_OUT << "Dlopen " << compiler_plugin_path << " failed!" << dlerror();
+    return;
+  }
+  aclopCompileAndExecute_ = DlsymAscendFuncObj(aclopCompileAndExecute, handler);
+  aclopCompileAndExecuteV2_ = DlsymAscendFuncObj(aclopCompileAndExecuteV2, handler);
+  aclSetCompileopt_ = DlsymAscendFuncObj(aclSetCompileopt, handler);
+  aclopSetCompileFlag_ = DlsymAscendFuncObj(aclopSetCompileFlag, handler);
+  aclGenGraphAndDumpForOp_ = DlsymAscendFuncObj(aclGenGraphAndDumpForOp, handler);
+  LOG_OUT << "Load acl op compiler api success!";
+}
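+
+// Illustrative call site (hypothetical, for documentation only): once LoadAclOpCompilerApiSymbol()
+// has run, callers invoke the resolved wrapper instead of linking against libacl_op_compiler.so at
+// build time. ACL_PRECISION_MODE / "allow_fp32_to_fp16" below are one example compile option pair
+// from acl_op_compiler.h:
+//
+//   if (aclSetCompileopt_ != nullptr) {
+//     (void)aclSetCompileopt_(ACL_PRECISION_MODE, "allow_fp32_to_fp16");
+//   }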
+
+void LoadSimulationAclOpCompilerApi() {
+  ASSIGN_SIMU(aclopCompileAndExecute);
+  ASSIGN_SIMU(aclopCompileAndExecuteV2);
+  ASSIGN_SIMU(aclSetCompileopt);
+  ASSIGN_SIMU(aclopSetCompileFlag);
+  ASSIGN_SIMU(aclGenGraphAndDumpForOp);
+}
+} // namespace mindspore::device::ascend
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc9b7cd46fc15264613ce78053e53fff02da9460
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_compiler_symbol.h
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_COMPILER_SYMBOL_H_
+#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_COMPILER_SYMBOL_H_
+#include <string>
+#include "acl/acl_op_compiler.h"
+#include "hardware/hardware_abstract/dlopen_macro.h"
+
+namespace mindspore::device::ascend {
+
+ORIGIN_METHOD_WITH_SIMU(aclopCompileAndExecute, aclError, const char *, int, const aclTensorDesc *const[],
+                        const aclDataBuffer *const[], int, const aclTensorDesc *const[], aclDataBuffer *const[],
+                        const aclopAttr *, aclopEngineType, aclopCompileType, const char *, aclrtStream);
+ORIGIN_METHOD_WITH_SIMU(aclopCompileAndExecuteV2, aclError, const char *, int, aclTensorDesc *[], aclDataBuffer *[],
+                        int, aclTensorDesc *[], aclDataBuffer *[], aclopAttr *, aclopEngineType, aclopCompileType,
+                        const char *, aclrtStream);
+ORIGIN_METHOD_WITH_SIMU(aclSetCompileopt, aclError, aclCompileOpt, const char *);
+ORIGIN_METHOD_WITH_SIMU(aclopSetCompileFlag, aclError, aclOpCompileFlag);
+ORIGIN_METHOD_WITH_SIMU(aclGenGraphAndDumpForOp, aclError, const char *, int, const aclTensorDesc *const[],
+                        const aclDataBuffer *const[], int, const aclTensorDesc *const[], aclDataBuffer *const[],
+                        const aclopAttr *, aclopEngineType, const char *, const aclGraphDumpOption *);
+
+void LoadAclOpCompilerApiSymbol(const std::string &ascend_path);
+void LoadSimulationAclOpCompilerApi();
+} // namespace mindspore::device::ascend
+
+#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_COMPILER_SYMBOL_H_
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4bddd8e822d8c1520a1974aa0def1eb7650477a
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.cc
@@ -0,0 +1,185 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "acl_mdl_symbol.h" +#include +#include "symbol_utils.h" + +namespace mindspore::device::ascend { +aclmdlAddDatasetBufferFunObj aclmdlAddDatasetBuffer_ = nullptr; +aclmdlCreateDatasetFunObj aclmdlCreateDataset_ = nullptr; +aclmdlCreateDescFunObj aclmdlCreateDesc_ = nullptr; +aclmdlDestroyDatasetFunObj aclmdlDestroyDataset_ = nullptr; +aclmdlDestroyDescFunObj aclmdlDestroyDesc_ = nullptr; +aclmdlExecuteFunObj aclmdlExecute_ = nullptr; +aclmdlFinalizeDumpFunObj aclmdlFinalizeDump_ = nullptr; +aclmdlGetCurOutputDimsFunObj aclmdlGetCurOutputDims_ = nullptr; +aclmdlGetDatasetBufferFunObj aclmdlGetDatasetBuffer_ = nullptr; +aclmdlGetDatasetNumBuffersFunObj aclmdlGetDatasetNumBuffers_ = nullptr; +aclmdlGetDescFunObj aclmdlGetDesc_ = nullptr; +aclmdlGetInputDataTypeFunObj aclmdlGetInputDataType_ = nullptr; +aclmdlGetInputDimsFunObj aclmdlGetInputDims_ = nullptr; +aclmdlGetInputIndexByNameFunObj aclmdlGetInputIndexByName_ = nullptr; +aclmdlGetInputNameByIndexFunObj aclmdlGetInputNameByIndex_ = nullptr; +aclmdlGetInputSizeByIndexFunObj aclmdlGetInputSizeByIndex_ = nullptr; +aclmdlGetNumInputsFunObj aclmdlGetNumInputs_ = nullptr; +aclmdlGetNumOutputsFunObj aclmdlGetNumOutputs_ = nullptr; +aclmdlGetOutputDataTypeFunObj aclmdlGetOutputDataType_ = nullptr; +aclmdlGetOutputDimsFunObj aclmdlGetOutputDims_ = nullptr; +aclmdlGetOutputNameByIndexFunObj aclmdlGetOutputNameByIndex_ = nullptr; +aclmdlGetOutputSizeByIndexFunObj aclmdlGetOutputSizeByIndex_ = nullptr; +aclmdlInitDumpFunObj aclmdlInitDump_ = nullptr; +aclmdlLoadFromMemFunObj aclmdlLoadFromMem_ = nullptr; +aclmdlSetDumpFunObj aclmdlSetDump_ = nullptr; +aclmdlSetDynamicBatchSizeFunObj aclmdlSetDynamicBatchSize_ = nullptr; +aclmdlUnloadFunObj aclmdlUnload_ = nullptr; +aclmdlQuerySizeFromMemFunObj aclmdlQuerySizeFromMem_ = nullptr; +aclmdlBundleGetModelIdFunObj aclmdlBundleGetModelId_ = nullptr; +aclmdlBundleLoadFromMemFunObj aclmdlBundleLoadFromMem_ = nullptr; +aclmdlBundleUnloadFunObj aclmdlBundleUnload_ = nullptr; +aclmdlLoadFromMemWithMemFunObj aclmdlLoadFromMemWithMem_ = nullptr; +aclmdlSetDatasetTensorDescFunObj aclmdlSetDatasetTensorDesc_ = nullptr; +aclmdlGetInputFormatFunObj aclmdlGetInputFormat_ = nullptr; +aclmdlGetDatasetTensorDescFunObj aclmdlGetDatasetTensorDesc_ = nullptr; +aclmdlSetInputDynamicDimsFunObj aclmdlSetInputDynamicDims_ = nullptr; +aclmdlGetOutputFormatFunObj aclmdlGetOutputFormat_ = nullptr; +aclmdlGetInputDimsV2FunObj aclmdlGetInputDimsV2_ = nullptr; +aclmdlGetDynamicHWFunObj aclmdlGetDynamicHW_ = nullptr; +aclmdlGetInputDynamicDimsFunObj aclmdlGetInputDynamicDims_ = nullptr; +aclmdlGetInputDynamicGearCountFunObj aclmdlGetInputDynamicGearCount_ = nullptr; +aclmdlGetDynamicBatchFunObj aclmdlGetDynamicBatch_ = nullptr; +aclmdlSetDynamicHWSizeFunObj aclmdlSetDynamicHWSize_ = nullptr; +#if defined(__linux__) && defined(WITH_BACKEND) +aclmdlRICaptureBeginFunObj aclmdlRICaptureBegin_ = nullptr; +aclmdlRICaptureGetInfoFunObj aclmdlRICaptureGetInfo_ = nullptr; +aclmdlRICaptureEndFunObj aclmdlRICaptureEnd_ = nullptr; +aclmdlRIExecuteAsyncFunObj aclmdlRIExecuteAsync_ = nullptr; +aclmdlRIDestroyFunObj aclmdlRIDestroy_ = nullptr; +#endif + +void LoadAclMdlApiSymbol(const std::string &ascend_path) { + std::string aclmdl_plugin_path = ascend_path + "lib64/libascendcl.so"; + auto handler = GetLibHandler(aclmdl_plugin_path); + if (handler == nullptr) { + LOG_OUT << "Dlopen " << aclmdl_plugin_path << " failed!" 
<< dlerror(); + return; + } + aclmdlAddDatasetBuffer_ = DlsymAscendFuncObj(aclmdlAddDatasetBuffer, handler); + aclmdlCreateDataset_ = DlsymAscendFuncObj(aclmdlCreateDataset, handler); + aclmdlCreateDesc_ = DlsymAscendFuncObj(aclmdlCreateDesc, handler); + aclmdlDestroyDataset_ = DlsymAscendFuncObj(aclmdlDestroyDataset, handler); + aclmdlDestroyDesc_ = DlsymAscendFuncObj(aclmdlDestroyDesc, handler); + aclmdlExecute_ = DlsymAscendFuncObj(aclmdlExecute, handler); + aclmdlFinalizeDump_ = DlsymAscendFuncObj(aclmdlFinalizeDump, handler); + aclmdlGetCurOutputDims_ = DlsymAscendFuncObj(aclmdlGetCurOutputDims, handler); + aclmdlGetDatasetBuffer_ = DlsymAscendFuncObj(aclmdlGetDatasetBuffer, handler); + aclmdlGetDatasetNumBuffers_ = DlsymAscendFuncObj(aclmdlGetDatasetNumBuffers, handler); + aclmdlGetDesc_ = DlsymAscendFuncObj(aclmdlGetDesc, handler); + aclmdlGetInputDataType_ = DlsymAscendFuncObj(aclmdlGetInputDataType, handler); + aclmdlGetInputDims_ = DlsymAscendFuncObj(aclmdlGetInputDims, handler); + aclmdlGetInputIndexByName_ = DlsymAscendFuncObj(aclmdlGetInputIndexByName, handler); + aclmdlGetInputNameByIndex_ = DlsymAscendFuncObj(aclmdlGetInputNameByIndex, handler); + aclmdlGetInputSizeByIndex_ = DlsymAscendFuncObj(aclmdlGetInputSizeByIndex, handler); + aclmdlGetNumInputs_ = DlsymAscendFuncObj(aclmdlGetNumInputs, handler); + aclmdlGetNumOutputs_ = DlsymAscendFuncObj(aclmdlGetNumOutputs, handler); + aclmdlGetOutputDataType_ = DlsymAscendFuncObj(aclmdlGetOutputDataType, handler); + aclmdlGetOutputDims_ = DlsymAscendFuncObj(aclmdlGetOutputDims, handler); + aclmdlQuerySizeFromMem_ = DlsymAscendFuncObj(aclmdlQuerySizeFromMem, handler); + aclmdlGetOutputNameByIndex_ = DlsymAscendFuncObj(aclmdlGetOutputNameByIndex, handler); + aclmdlGetOutputSizeByIndex_ = DlsymAscendFuncObj(aclmdlGetOutputSizeByIndex, handler); + aclmdlInitDump_ = DlsymAscendFuncObj(aclmdlInitDump, handler); + aclmdlLoadFromMem_ = DlsymAscendFuncObj(aclmdlLoadFromMem, handler); + aclmdlSetDump_ = DlsymAscendFuncObj(aclmdlSetDump, handler); + aclmdlSetDynamicBatchSize_ = DlsymAscendFuncObj(aclmdlSetDynamicBatchSize, handler); + aclmdlUnload_ = DlsymAscendFuncObj(aclmdlUnload, handler); + aclmdlBundleGetModelId_ = DlsymAscendFuncObj(aclmdlBundleGetModelId, handler); + aclmdlBundleLoadFromMem_ = DlsymAscendFuncObj(aclmdlBundleLoadFromMem, handler); + aclmdlBundleUnload_ = DlsymAscendFuncObj(aclmdlBundleUnload, handler); + aclmdlLoadFromMemWithMem_ = DlsymAscendFuncObj(aclmdlLoadFromMemWithMem, handler); + aclmdlSetDatasetTensorDesc_ = DlsymAscendFuncObj(aclmdlSetDatasetTensorDesc, handler); + aclmdlGetInputFormat_ = DlsymAscendFuncObj(aclmdlGetInputFormat, handler); + aclmdlGetDatasetTensorDesc_ = DlsymAscendFuncObj(aclmdlGetDatasetTensorDesc, handler); + aclmdlSetInputDynamicDims_ = DlsymAscendFuncObj(aclmdlSetInputDynamicDims, handler); + aclmdlGetOutputFormat_ = DlsymAscendFuncObj(aclmdlGetOutputFormat, handler); + aclmdlGetInputDimsV2_ = DlsymAscendFuncObj(aclmdlGetInputDimsV2, handler); + aclmdlGetDynamicHW_ = DlsymAscendFuncObj(aclmdlGetDynamicHW, handler); + aclmdlGetInputDynamicDims_ = DlsymAscendFuncObj(aclmdlGetInputDynamicDims, handler); + aclmdlGetInputDynamicGearCount_ = DlsymAscendFuncObj(aclmdlGetInputDynamicGearCount, handler); + aclmdlGetDynamicBatch_ = DlsymAscendFuncObj(aclmdlGetDynamicBatch, handler); + aclmdlSetDynamicHWSize_ = DlsymAscendFuncObj(aclmdlSetDynamicHWSize, handler); +#if defined(__linux__) && defined(WITH_BACKEND) + aclmdlRICaptureBegin_ = DlsymAscendFuncObj(aclmdlRICaptureBegin, handler); + 
aclmdlRICaptureGetInfo_ = DlsymAscendFuncObj(aclmdlRICaptureGetInfo, handler); + aclmdlRICaptureEnd_ = DlsymAscendFuncObj(aclmdlRICaptureEnd, handler); + aclmdlRIExecuteAsync_ = DlsymAscendFuncObj(aclmdlRIExecuteAsync, handler); + aclmdlRIDestroy_ = DlsymAscendFuncObj(aclmdlRIDestroy, handler); +#endif + + LOG_OUT << "Load acl mdl api success!"; +} + +void LoadSimulationAclMdlApi() { + ASSIGN_SIMU(aclmdlAddDatasetBuffer); + ASSIGN_SIMU(aclmdlCreateDataset); + ASSIGN_SIMU(aclmdlCreateDesc); + ASSIGN_SIMU(aclmdlDestroyDataset); + ASSIGN_SIMU(aclmdlDestroyDesc); + ASSIGN_SIMU(aclmdlExecute); + ASSIGN_SIMU(aclmdlFinalizeDump); + ASSIGN_SIMU(aclmdlGetCurOutputDims); + ASSIGN_SIMU(aclmdlGetDatasetBuffer); + ASSIGN_SIMU(aclmdlGetDatasetNumBuffers); + ASSIGN_SIMU(aclmdlGetDesc); + ASSIGN_SIMU(aclmdlGetInputDataType); + ASSIGN_SIMU(aclmdlGetInputDims); + ASSIGN_SIMU(aclmdlGetInputIndexByName); + ASSIGN_SIMU(aclmdlGetInputNameByIndex); + ASSIGN_SIMU(aclmdlGetInputSizeByIndex); + ASSIGN_SIMU(aclmdlGetNumInputs); + ASSIGN_SIMU(aclmdlGetNumOutputs); + ASSIGN_SIMU(aclmdlGetOutputDataType); + ASSIGN_SIMU(aclmdlGetOutputDims); + ASSIGN_SIMU(aclmdlGetOutputNameByIndex); + ASSIGN_SIMU(aclmdlGetOutputSizeByIndex); + ASSIGN_SIMU(aclmdlInitDump); + ASSIGN_SIMU(aclmdlLoadFromMem); + ASSIGN_SIMU(aclmdlSetDump); + ASSIGN_SIMU(aclmdlSetDynamicBatchSize); + ASSIGN_SIMU(aclmdlUnload); + ASSIGN_SIMU(aclmdlQuerySizeFromMem); + ASSIGN_SIMU(aclmdlBundleGetModelId); + ASSIGN_SIMU(aclmdlBundleLoadFromMem); + ASSIGN_SIMU(aclmdlBundleUnload); + ASSIGN_SIMU(aclmdlLoadFromMemWithMem); + ASSIGN_SIMU(aclmdlSetDatasetTensorDesc); + ASSIGN_SIMU(aclmdlGetInputFormat); + ASSIGN_SIMU(aclmdlGetDatasetTensorDesc); + ASSIGN_SIMU(aclmdlSetInputDynamicDims); + ASSIGN_SIMU(aclmdlGetOutputFormat); + ASSIGN_SIMU(aclmdlGetInputDimsV2); + ASSIGN_SIMU(aclmdlGetDynamicHW); + ASSIGN_SIMU(aclmdlGetInputDynamicDims); + ASSIGN_SIMU(aclmdlGetInputDynamicGearCount); + ASSIGN_SIMU(aclmdlGetDynamicBatch); + ASSIGN_SIMU(aclmdlSetDynamicHWSize); +#if defined(__linux__) && defined(WITH_BACKEND) + ASSIGN_SIMU(aclmdlRICaptureBegin); + ASSIGN_SIMU(aclmdlRICaptureGetInfo); + ASSIGN_SIMU(aclmdlRICaptureEnd); + ASSIGN_SIMU(aclmdlRIExecuteAsync); + ASSIGN_SIMU(aclmdlRIDestroy); +#endif +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.h new file mode 100644 index 0000000000000000000000000000000000000000..19abf85f91a0372fa0390ae5a58afab789454a46 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_mdl_symbol.h @@ -0,0 +1,130 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_MDL_SYMBOL_H_ +#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_MDL_SYMBOL_H_ +#include +#include "acl/acl_mdl.h" +#include "hardware/hardware_abstract/dlopen_macro.h" + +namespace mindspore::device::ascend { +ORIGIN_METHOD_WITH_SIMU(aclmdlAddDatasetBuffer, aclError, aclmdlDataset *, aclDataBuffer *) +ORIGIN_METHOD_WITH_SIMU(aclmdlCreateDataset, aclmdlDataset *); +ORIGIN_METHOD_WITH_SIMU(aclmdlCreateDesc, aclmdlDesc *) +ORIGIN_METHOD_WITH_SIMU(aclmdlDestroyDataset, aclError, const aclmdlDataset *) +ORIGIN_METHOD_WITH_SIMU(aclmdlDestroyDesc, aclError, aclmdlDesc *) +ORIGIN_METHOD_WITH_SIMU(aclmdlExecute, aclError, uint32_t, const aclmdlDataset *, aclmdlDataset *) +ORIGIN_METHOD_WITH_SIMU(aclmdlFinalizeDump, aclError) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetCurOutputDims, aclError, const aclmdlDesc *, size_t, aclmdlIODims *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDatasetBuffer, aclDataBuffer *, const aclmdlDataset *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDatasetNumBuffers, size_t, const aclmdlDataset *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDesc, aclError, aclmdlDesc *, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputDataType, aclDataType, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputDims, aclError, const aclmdlDesc *, size_t, aclmdlIODims *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputIndexByName, aclError, const aclmdlDesc *, const char *, size_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputNameByIndex, const char *, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputSizeByIndex, size_t, aclmdlDesc *, size_t index) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetNumInputs, size_t, aclmdlDesc *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetNumOutputs, size_t, aclmdlDesc *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetOutputDataType, aclDataType, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetOutputDims, aclError, const aclmdlDesc *, size_t, aclmdlIODims *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetOutputNameByIndex, const char *, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetOutputSizeByIndex, size_t, aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlInitDump, aclError) +ORIGIN_METHOD_WITH_SIMU(aclmdlLoadFromMem, aclError, const void *, size_t, uint32_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlSetDump, aclError, const char *) +ORIGIN_METHOD_WITH_SIMU(aclmdlSetDynamicBatchSize, aclError, uint32_t, aclmdlDataset *, size_t, uint64_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlUnload, aclError, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlQuerySizeFromMem, aclError, const void *, size_t, size_t *, size_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlBundleGetModelId, aclError, uint32_t, size_t, uint32_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlBundleLoadFromMem, aclError, const void *, size_t, uint32_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlBundleUnload, aclError, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlLoadFromMemWithMem, aclError, const void *, size_t, uint32_t *, void *, size_t, void *, + size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlSetDatasetTensorDesc, aclError, aclmdlDataset *, aclTensorDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputFormat, aclFormat, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDatasetTensorDesc, aclTensorDesc *, const aclmdlDataset *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlSetInputDynamicDims, aclError, uint32_t, aclmdlDataset *, size_t, const aclmdlIODims *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetOutputFormat, aclFormat, const aclmdlDesc *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputDimsV2, aclError, const aclmdlDesc *, size_t, aclmdlIODims 
*) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDynamicHW, aclError, const aclmdlDesc *, size_t, aclmdlHW *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputDynamicDims, aclError, const aclmdlDesc *, size_t, aclmdlIODims *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetInputDynamicGearCount, aclError, const aclmdlDesc *, size_t, size_t *) +ORIGIN_METHOD_WITH_SIMU(aclmdlGetDynamicBatch, aclError, const aclmdlDesc *, aclmdlBatch *) +ORIGIN_METHOD_WITH_SIMU(aclmdlSetDynamicHWSize, aclError, uint32_t, aclmdlDataset *, size_t, uint64_t, uint64_t) +#if defined(__linux__) && defined(WITH_BACKEND) +ORIGIN_METHOD_WITH_SIMU(aclmdlRICaptureBegin, aclError, aclrtStream, aclmdlRICaptureMode) +ORIGIN_METHOD_WITH_SIMU(aclmdlRICaptureGetInfo, aclError, aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *) +ORIGIN_METHOD_WITH_SIMU(aclmdlRICaptureEnd, aclError, aclrtStream, aclmdlRI *) +ORIGIN_METHOD_WITH_SIMU(aclmdlRIExecuteAsync, aclError, aclmdlRI, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclmdlRIDestroy, aclError, aclmdlRI) +#endif + +extern aclmdlAddDatasetBufferFunObj aclmdlAddDatasetBuffer_; +extern aclmdlCreateDatasetFunObj aclmdlCreateDataset_; +extern aclmdlCreateDescFunObj aclmdlCreateDesc_; +extern aclmdlDestroyDatasetFunObj aclmdlDestroyDataset_; +extern aclmdlDestroyDescFunObj aclmdlDestroyDesc_; +extern aclmdlExecuteFunObj aclmdlExecute_; +extern aclmdlFinalizeDumpFunObj aclmdlFinalizeDump_; +extern aclmdlGetCurOutputDimsFunObj aclmdlGetCurOutputDims_; +extern aclmdlGetDatasetBufferFunObj aclmdlGetDatasetBuffer_; +extern aclmdlGetDatasetNumBuffersFunObj aclmdlGetDatasetNumBuffers_; +extern aclmdlGetDescFunObj aclmdlGetDesc_; +extern aclmdlGetInputDataTypeFunObj aclmdlGetInputDataType_; +extern aclmdlGetInputDimsFunObj aclmdlGetInputDims_; +extern aclmdlGetInputIndexByNameFunObj aclmdlGetInputIndexByName_; +extern aclmdlGetInputNameByIndexFunObj aclmdlGetInputNameByIndex_; +extern aclmdlGetInputSizeByIndexFunObj aclmdlGetInputSizeByIndex_; +extern aclmdlGetNumInputsFunObj aclmdlGetNumInputs_; +extern aclmdlGetNumOutputsFunObj aclmdlGetNumOutputs_; +extern aclmdlGetOutputDataTypeFunObj aclmdlGetOutputDataType_; +extern aclmdlGetOutputDimsFunObj aclmdlGetOutputDims_; +extern aclmdlGetOutputNameByIndexFunObj aclmdlGetOutputNameByIndex_; +extern aclmdlGetOutputSizeByIndexFunObj aclmdlGetOutputSizeByIndex_; +extern aclmdlInitDumpFunObj aclmdlInitDump_; +extern aclmdlLoadFromMemFunObj aclmdlLoadFromMem_; +extern aclmdlSetDumpFunObj aclmdlSetDump_; +extern aclmdlSetDynamicBatchSizeFunObj aclmdlSetDynamicBatchSize_; +extern aclmdlUnloadFunObj aclmdlUnload_; +extern aclmdlQuerySizeFromMemFunObj aclmdlQuerySizeFromMem_; +extern aclmdlBundleGetModelIdFunObj aclmdlBundleGetModelId_; +extern aclmdlBundleLoadFromMemFunObj aclmdlBundleLoadFromMem_; +extern aclmdlBundleUnloadFunObj aclmdlBundleUnload_; +extern aclmdlLoadFromMemWithMemFunObj aclmdlLoadFromMemWithMem_; +extern aclmdlSetDatasetTensorDescFunObj aclmdlSetDatasetTensorDesc_; +extern aclmdlGetInputFormatFunObj aclmdlGetInputFormat_; +extern aclmdlGetDatasetTensorDescFunObj aclmdlGetDatasetTensorDesc_; +extern aclmdlSetInputDynamicDimsFunObj aclmdlSetInputDynamicDims_; +extern aclmdlGetOutputFormatFunObj aclmdlGetOutputFormat_; +extern aclmdlGetInputDimsV2FunObj aclmdlGetInputDimsV2_; +extern aclmdlGetDynamicHWFunObj aclmdlGetDynamicHW_; +extern aclmdlGetInputDynamicDimsFunObj aclmdlGetInputDynamicDims_; +extern aclmdlGetInputDynamicGearCountFunObj aclmdlGetInputDynamicGearCount_; +extern aclmdlGetDynamicBatchFunObj aclmdlGetDynamicBatch_; +extern aclmdlSetDynamicHWSizeFunObj 
aclmdlSetDynamicHWSize_; +#if defined(__linux__) && defined(WITH_BACKEND) +extern aclmdlRICaptureBeginFunObj aclmdlRICaptureBegin_; +extern aclmdlRICaptureGetInfoFunObj aclmdlRICaptureGetInfo_; +extern aclmdlRICaptureEndFunObj aclmdlRICaptureEnd_; +extern aclmdlRIExecuteAsyncFunObj aclmdlRIExecuteAsync_; +extern aclmdlRIDestroyFunObj aclmdlRIDestroy_; +#endif + +void LoadAclMdlApiSymbol(const std::string &ascend_path); +void LoadSimulationAclMdlApi(); +} // namespace mindspore::device::ascend + +#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_MDL_SYMBOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.cc new file mode 100644 index 0000000000000000000000000000000000000000..2355e0705c4af92a9709bc2997496ecb12342fd6 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.cc @@ -0,0 +1,73 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "acl_op_symbol.h" +#include +#include "symbol_utils.h" + +namespace mindspore::device::ascend { +aclopCreateAttrFunObj aclopCreateAttr_ = nullptr; +aclopSetAttrBoolFunObj aclopSetAttrBool_ = nullptr; +aclopSetAttrDataTypeFunObj aclopSetAttrDataType_ = nullptr; +aclopSetAttrFloatFunObj aclopSetAttrFloat_ = nullptr; +aclopSetAttrIntFunObj aclopSetAttrInt_ = nullptr; +aclopSetAttrListBoolFunObj aclopSetAttrListBool_ = nullptr; +aclopSetAttrListDataTypeFunObj aclopSetAttrListDataType_ = nullptr; +aclopSetAttrListFloatFunObj aclopSetAttrListFloat_ = nullptr; +aclopSetAttrListIntFunObj aclopSetAttrListInt_ = nullptr; +aclopSetAttrListListIntFunObj aclopSetAttrListListInt_ = nullptr; +aclopSetAttrListStringFunObj aclopSetAttrListString_ = nullptr; +aclopSetAttrStringFunObj aclopSetAttrString_ = nullptr; +aclopSetModelDirFunObj aclopSetModelDir_ = nullptr; + +void LoadAclOpApiSymbol(const std::string &ascend_path) { + std::string ascendcl_plugin_path = ascend_path + "lib64/libascendcl.so"; + auto handler = GetLibHandler(ascendcl_plugin_path); + if (handler == nullptr) { + LOG_OUT << "Dlopen " << ascendcl_plugin_path << " failed!" 
<< dlerror(); + return; + } + aclopCreateAttr_ = DlsymAscendFuncObj(aclopCreateAttr, handler); + aclopSetAttrBool_ = DlsymAscendFuncObj(aclopSetAttrBool, handler); + aclopSetAttrDataType_ = DlsymAscendFuncObj(aclopSetAttrDataType, handler); + aclopSetAttrFloat_ = DlsymAscendFuncObj(aclopSetAttrFloat, handler); + aclopSetAttrInt_ = DlsymAscendFuncObj(aclopSetAttrInt, handler); + aclopSetAttrListBool_ = DlsymAscendFuncObj(aclopSetAttrListBool, handler); + aclopSetAttrListDataType_ = DlsymAscendFuncObj(aclopSetAttrListDataType, handler); + aclopSetAttrListFloat_ = DlsymAscendFuncObj(aclopSetAttrListFloat, handler); + aclopSetAttrListInt_ = DlsymAscendFuncObj(aclopSetAttrListInt, handler); + aclopSetAttrListListInt_ = DlsymAscendFuncObj(aclopSetAttrListListInt, handler); + aclopSetAttrListString_ = DlsymAscendFuncObj(aclopSetAttrListString, handler); + aclopSetAttrString_ = DlsymAscendFuncObj(aclopSetAttrString, handler); + aclopSetModelDir_ = DlsymAscendFuncObj(aclopSetModelDir, handler); + LOG_OUT << "Load ascend op api success!"; +} + +void LoadSimulationAclOpApi() { + ASSIGN_SIMU(aclopCreateAttr); + ASSIGN_SIMU(aclopSetAttrBool); + ASSIGN_SIMU(aclopSetAttrDataType); + ASSIGN_SIMU(aclopSetAttrFloat); + ASSIGN_SIMU(aclopSetAttrInt); + ASSIGN_SIMU(aclopSetAttrListBool); + ASSIGN_SIMU(aclopSetAttrListDataType); + ASSIGN_SIMU(aclopSetAttrListFloat); + ASSIGN_SIMU(aclopSetAttrListInt); + ASSIGN_SIMU(aclopSetAttrListListInt); + ASSIGN_SIMU(aclopSetAttrListString); + ASSIGN_SIMU(aclopSetAttrString); + ASSIGN_SIMU(aclopSetModelDir); +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.h new file mode 100644 index 0000000000000000000000000000000000000000..af7adb3a71c6eb32d19ab140464cb4cd2e0a46d3 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_op_symbol.h @@ -0,0 +1,57 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_OP_SYMBOL_H_ +#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_OP_SYMBOL_H_ +#include +#include "acl/acl_op.h" +#include "hardware/hardware_abstract/dlopen_macro.h" + +namespace mindspore::device::ascend { + +ORIGIN_METHOD_WITH_SIMU(aclopCreateAttr, aclopAttr *) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrBool, aclError, aclopAttr *, const char *, uint8_t) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrDataType, aclError, aclopAttr *, const char *, aclDataType) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrFloat, aclError, aclopAttr *, const char *, float) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrInt, aclError, aclopAttr *, const char *, int64_t) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListBool, aclError, aclopAttr *, const char *, int, const uint8_t *) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListDataType, aclError, aclopAttr *, const char *, int, const aclDataType[]) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListFloat, aclError, aclopAttr *, const char *, int, const float *) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListInt, aclError, aclopAttr *, const char *, int, const int64_t *) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListListInt, aclError, aclopAttr *, const char *, int, const int *, + const int64_t *const[]) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrListString, aclError, aclopAttr *, const char *, int, const char **) +ORIGIN_METHOD_WITH_SIMU(aclopSetAttrString, aclError, aclopAttr *, const char *, const char *) +ORIGIN_METHOD_WITH_SIMU(aclopSetModelDir, aclError, const char *) + +extern aclopCreateAttrFunObj aclopCreateAttr_; +extern aclopSetAttrBoolFunObj aclopSetAttrBool_; +extern aclopSetAttrDataTypeFunObj aclopSetAttrDataType_; +extern aclopSetAttrFloatFunObj aclopSetAttrFloat_; +extern aclopSetAttrIntFunObj aclopSetAttrInt_; +extern aclopSetAttrListBoolFunObj aclopSetAttrListBool_; +extern aclopSetAttrListDataTypeFunObj aclopSetAttrListDataType_; +extern aclopSetAttrListFloatFunObj aclopSetAttrListFloat_; +extern aclopSetAttrListIntFunObj aclopSetAttrListInt_; +extern aclopSetAttrListListIntFunObj aclopSetAttrListListInt_; +extern aclopSetAttrListStringFunObj aclopSetAttrListString_; +extern aclopSetAttrStringFunObj aclopSetAttrString_; +extern aclopSetModelDirFunObj aclopSetModelDir_; + +void LoadAclOpApiSymbol(const std::string &ascend_path); +void LoadSimulationAclOpApi(); +} // namespace mindspore::device::ascend + +#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_OP_SYMBOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.cc new file mode 100644 index 0000000000000000000000000000000000000000..885886059d26a5a73a819e3af51dc1c8c37bc25b --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.cc @@ -0,0 +1,62 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "acl_rt_allocator_symbol.h" +#include +#include "symbol_utils.h" + +namespace mindspore::device::ascend { +aclrtAllocatorCreateDescFunObj aclrtAllocatorCreateDesc_ = nullptr; +aclrtAllocatorDestroyDescFunObj aclrtAllocatorDestroyDesc_ = nullptr; +aclrtAllocatorRegisterFunObj aclrtAllocatorRegister_ = nullptr; +aclrtAllocatorSetAllocAdviseFuncToDescFunObj aclrtAllocatorSetAllocAdviseFuncToDesc_ = nullptr; +aclrtAllocatorSetAllocFuncToDescFunObj aclrtAllocatorSetAllocFuncToDesc_ = nullptr; +aclrtAllocatorSetFreeFuncToDescFunObj aclrtAllocatorSetFreeFuncToDesc_ = nullptr; +aclrtAllocatorSetGetAddrFromBlockFuncToDescFunObj aclrtAllocatorSetGetAddrFromBlockFuncToDesc_ = nullptr; +aclrtAllocatorSetObjToDescFunObj aclrtAllocatorSetObjToDesc_ = nullptr; +aclrtAllocatorUnregisterFunObj aclrtAllocatorUnregister_ = nullptr; + +void LoadAclAllocatorApiSymbol(const std::string &ascend_path) { + std::string allocator_plugin_path = ascend_path + "lib64/libascendcl.so"; + auto handler = GetLibHandler(allocator_plugin_path); + if (handler == nullptr) { + LOG_OUT << "Dlopen " << allocator_plugin_path << " failed!" << dlerror(); + return; + } + aclrtAllocatorCreateDesc_ = DlsymAscendFuncObj(aclrtAllocatorCreateDesc, handler); + aclrtAllocatorDestroyDesc_ = DlsymAscendFuncObj(aclrtAllocatorDestroyDesc, handler); + aclrtAllocatorRegister_ = DlsymAscendFuncObj(aclrtAllocatorRegister, handler); + aclrtAllocatorSetAllocAdviseFuncToDesc_ = DlsymAscendFuncObj(aclrtAllocatorSetAllocAdviseFuncToDesc, handler); + aclrtAllocatorSetAllocFuncToDesc_ = DlsymAscendFuncObj(aclrtAllocatorSetAllocFuncToDesc, handler); + aclrtAllocatorSetFreeFuncToDesc_ = DlsymAscendFuncObj(aclrtAllocatorSetFreeFuncToDesc, handler); + aclrtAllocatorSetGetAddrFromBlockFuncToDesc_ = + DlsymAscendFuncObj(aclrtAllocatorSetGetAddrFromBlockFuncToDesc, handler); + aclrtAllocatorSetObjToDesc_ = DlsymAscendFuncObj(aclrtAllocatorSetObjToDesc, handler); + aclrtAllocatorUnregister_ = DlsymAscendFuncObj(aclrtAllocatorUnregister, handler); + LOG_OUT << "Load acl allocator api success!"; +} + +void LoadSimulationAclAllocatorApi() { + ASSIGN_SIMU(aclrtAllocatorCreateDesc); + ASSIGN_SIMU(aclrtAllocatorDestroyDesc); + ASSIGN_SIMU(aclrtAllocatorRegister); + ASSIGN_SIMU(aclrtAllocatorSetAllocAdviseFuncToDesc); + ASSIGN_SIMU(aclrtAllocatorSetAllocFuncToDesc); + ASSIGN_SIMU(aclrtAllocatorSetFreeFuncToDesc); + ASSIGN_SIMU(aclrtAllocatorSetGetAddrFromBlockFuncToDesc); + ASSIGN_SIMU(aclrtAllocatorSetObjToDesc); + ASSIGN_SIMU(aclrtAllocatorUnregister); +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.h new file mode 100644 index 0000000000000000000000000000000000000000..e9d29334059ebb9ca1798253c00a66f12d82298c --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_allocator_symbol.h @@ -0,0 +1,49 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_ALLOCATOR_SYMBOL_H_ +#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_ALLOCATOR_SYMBOL_H_ +#include +#include "acl/acl_rt_allocator.h" +#include "hardware/hardware_abstract/dlopen_macro.h" + +namespace mindspore::device::ascend { +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorCreateDesc, aclrtAllocatorDesc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorDestroyDesc, aclError, aclrtAllocatorDesc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorRegister, aclError, aclrtStream, aclrtAllocatorDesc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorSetAllocAdviseFuncToDesc, aclError, aclrtAllocatorDesc, + aclrtAllocatorAllocAdviseFunc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorSetAllocFuncToDesc, aclError, aclrtAllocatorDesc, aclrtAllocatorAllocFunc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorSetFreeFuncToDesc, aclError, aclrtAllocatorDesc, aclrtAllocatorFreeFunc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorSetGetAddrFromBlockFuncToDesc, aclError, aclrtAllocatorDesc, + aclrtAllocatorGetAddrFromBlockFunc) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorSetObjToDesc, aclError, aclrtAllocatorDesc, aclrtAllocator) +ORIGIN_METHOD_WITH_SIMU(aclrtAllocatorUnregister, aclError, aclrtStream) + +extern aclrtAllocatorCreateDescFunObj aclrtAllocatorCreateDesc_; +extern aclrtAllocatorDestroyDescFunObj aclrtAllocatorDestroyDesc_; +extern aclrtAllocatorRegisterFunObj aclrtAllocatorRegister_; +extern aclrtAllocatorSetAllocAdviseFuncToDescFunObj aclrtAllocatorSetAllocAdviseFuncToDesc_; +extern aclrtAllocatorSetAllocFuncToDescFunObj aclrtAllocatorSetAllocFuncToDesc_; +extern aclrtAllocatorSetFreeFuncToDescFunObj aclrtAllocatorSetFreeFuncToDesc_; +extern aclrtAllocatorSetGetAddrFromBlockFuncToDescFunObj aclrtAllocatorSetGetAddrFromBlockFuncToDesc_; +extern aclrtAllocatorSetObjToDescFunObj aclrtAllocatorSetObjToDesc_; +extern aclrtAllocatorUnregisterFunObj aclrtAllocatorUnregister_; + +void LoadAclAllocatorApiSymbol(const std::string &ascend_path); +void LoadSimulationAclAllocatorApi(); +} // namespace mindspore::device::ascend + +#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_ALLOCATOR_SYMBOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ddc52205ee05278cc6d1bab942c68d8f61d1633 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.cc @@ -0,0 +1,228 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "acl_rt_symbol.h" +#include +#include "symbol_utils.h" + +int (*aclrt_get_last_error)(int) = nullptr; +const char *(*acl_get_recent_err_msg)() = nullptr; +namespace mindspore::device::ascend { +aclrtCreateContextFunObj aclrtCreateContext_ = nullptr; +aclrtCreateEventFunObj aclrtCreateEvent_ = nullptr; +aclrtCreateEventWithFlagFunObj aclrtCreateEventWithFlag_ = nullptr; +aclrtCreateEventExWithFlagFunObj aclrtCreateEventExWithFlag_ = nullptr; +aclrtCreateStreamWithConfigFunObj aclrtCreateStreamWithConfig_ = nullptr; +aclrtDestroyContextFunObj aclrtDestroyContext_ = nullptr; +aclrtDestroyEventFunObj aclrtDestroyEvent_ = nullptr; +aclrtDestroyStreamFunObj aclrtDestroyStream_ = nullptr; +aclrtDestroyStreamForceFunObj aclrtDestroyStreamForce_ = nullptr; +aclrtEventElapsedTimeFunObj aclrtEventElapsedTime_ = nullptr; +aclrtFreeFunObj aclrtFree_ = nullptr; +aclrtFreeHostFunObj aclrtFreeHost_ = nullptr; +aclrtGetCurrentContextFunObj aclrtGetCurrentContext_ = nullptr; +aclrtGetDeviceFunObj aclrtGetDevice_ = nullptr; +aclrtGetDeviceCountFunObj aclrtGetDeviceCount_ = nullptr; +aclrtGetDeviceIdFromExceptionInfoFunObj aclrtGetDeviceIdFromExceptionInfo_ = nullptr; +aclrtGetErrorCodeFromExceptionInfoFunObj aclrtGetErrorCodeFromExceptionInfo_ = nullptr; +aclrtGetMemInfoFunObj aclrtGetMemInfo_ = nullptr; +aclrtGetRunModeFunObj aclrtGetRunMode_ = nullptr; +aclrtGetStreamIdFromExceptionInfoFunObj aclrtGetStreamIdFromExceptionInfo_ = nullptr; +aclrtGetTaskIdFromExceptionInfoFunObj aclrtGetTaskIdFromExceptionInfo_ = nullptr; +aclrtGetThreadIdFromExceptionInfoFunObj aclrtGetThreadIdFromExceptionInfo_ = nullptr; +aclrtLaunchCallbackFunObj aclrtLaunchCallback_ = nullptr; +aclrtMallocFunObj aclrtMalloc_ = nullptr; +aclrtMallocHostFunObj aclrtMallocHost_ = nullptr; +aclrtMemcpyFunObj aclrtMemcpy_ = nullptr; +aclrtMemcpyAsyncFunObj aclrtMemcpyAsync_ = nullptr; +aclrtMemsetFunObj aclrtMemset_ = nullptr; +aclrtMemsetAsyncFunObj aclrtMemsetAsync_ = nullptr; +aclrtProcessReportFunObj aclrtProcessReport_ = nullptr; +aclrtQueryEventStatusFunObj aclrtQueryEventStatus_ = nullptr; +aclrtRecordEventFunObj aclrtRecordEvent_ = nullptr; +aclrtResetDeviceFunObj aclrtResetDevice_ = nullptr; +aclrtResetEventFunObj aclrtResetEvent_ = nullptr; +aclrtSetCurrentContextFunObj aclrtSetCurrentContext_ = nullptr; +aclrtSetDeviceFunObj aclrtSetDevice_ = nullptr; +aclrtSetDeviceSatModeFunObj aclrtSetDeviceSatMode_ = nullptr; +aclrtSetExceptionInfoCallbackFunObj aclrtSetExceptionInfoCallback_ = nullptr; +aclrtSetOpExecuteTimeOutFunObj aclrtSetOpExecuteTimeOut_ = nullptr; +aclrtSetOpWaitTimeoutFunObj aclrtSetOpWaitTimeout_ = nullptr; +aclrtSetStreamFailureModeFunObj aclrtSetStreamFailureMode_ = nullptr; +aclrtStreamQueryFunObj aclrtStreamQuery_ = nullptr; +aclrtStreamWaitEventFunObj aclrtStreamWaitEvent_ = nullptr; +aclrtSubscribeReportFunObj aclrtSubscribeReport_ = nullptr; +aclrtSynchronizeEventFunObj aclrtSynchronizeEvent_ = nullptr; +aclrtSynchronizeStreamFunObj aclrtSynchronizeStream_ = nullptr; +aclrtSynchronizeStreamWithTimeoutFunObj aclrtSynchronizeStreamWithTimeout_ = nullptr; +aclrtSynchronizeDeviceWithTimeoutFunObj aclrtSynchronizeDeviceWithTimeout_ = nullptr; +aclrtUnmapMemFunObj aclrtUnmapMem_ = nullptr; +aclrtReserveMemAddressFunObj aclrtReserveMemAddress_ = nullptr; +aclrtMallocPhysicalFunObj aclrtMallocPhysical_ = nullptr; +aclrtMapMemFunObj aclrtMapMem_ = nullptr; +aclrtFreePhysicalFunObj aclrtFreePhysical_ = nullptr; +aclrtReleaseMemAddressFunObj aclrtReleaseMemAddress_ = nullptr; +aclrtCtxSetSysParamOptFunObj 
aclrtCtxSetSysParamOpt_ = nullptr; +aclrtGetMemUceInfoFunObj aclrtGetMemUceInfo_ = nullptr; +aclrtDeviceTaskAbortFunObj aclrtDeviceTaskAbort_ = nullptr; +aclrtMemUceRepairFunObj aclrtMemUceRepair_ = nullptr; +aclrtEventGetTimestampFunObj aclrtEventGetTimestamp_ = nullptr; +aclrtDeviceGetBareTgidFunObj aclrtDeviceGetBareTgid_ = nullptr; +aclrtMemExportToShareableHandleFunObj aclrtMemExportToShareableHandle_ = nullptr; +aclrtMemSetPidToShareableHandleFunObj aclrtMemSetPidToShareableHandle_ = nullptr; +aclrtMemImportFromShareableHandleFunObj aclrtMemImportFromShareableHandle_ = nullptr; +aclrtGetLastErrorFunObj aclrtGetLastError_ = nullptr; + +void LoadAclRtApiSymbol(const std::string &ascend_path) { + std::string aclrt_plugin_path = ascend_path + "lib64/libascendcl.so"; + auto handler = GetLibHandler(aclrt_plugin_path); + if (handler == nullptr) { + LOG_OUT << "Dlopen " << aclrt_plugin_path << " failed!" << dlerror(); + return; + } + aclrtCreateContext_ = DlsymAscendFuncObj(aclrtCreateContext, handler); + aclrtCreateEvent_ = DlsymAscendFuncObj(aclrtCreateEvent, handler); + aclrtCreateEventWithFlag_ = DlsymAscendFuncObj(aclrtCreateEventWithFlag, handler); + aclrtCreateEventExWithFlag_ = DlsymAscendFuncObj(aclrtCreateEventExWithFlag, handler); + aclrtCreateStreamWithConfig_ = DlsymAscendFuncObj(aclrtCreateStreamWithConfig, handler); + aclrtDestroyContext_ = DlsymAscendFuncObj(aclrtDestroyContext, handler); + aclrtDestroyEvent_ = DlsymAscendFuncObj(aclrtDestroyEvent, handler); + aclrtDestroyStream_ = DlsymAscendFuncObj(aclrtDestroyStream, handler); + aclrtDestroyStreamForce_ = DlsymAscendFuncObj(aclrtDestroyStreamForce, handler); + aclrtEventElapsedTime_ = DlsymAscendFuncObj(aclrtEventElapsedTime, handler); + aclrtFree_ = DlsymAscendFuncObj(aclrtFree, handler); + aclrtFreeHost_ = DlsymAscendFuncObj(aclrtFreeHost, handler); + aclrtGetCurrentContext_ = DlsymAscendFuncObj(aclrtGetCurrentContext, handler); + aclrtGetDevice_ = DlsymAscendFuncObj(aclrtGetDevice, handler); + aclrtGetDeviceCount_ = DlsymAscendFuncObj(aclrtGetDeviceCount, handler); + aclrtGetDeviceIdFromExceptionInfo_ = DlsymAscendFuncObj(aclrtGetDeviceIdFromExceptionInfo, handler); + aclrtGetErrorCodeFromExceptionInfo_ = DlsymAscendFuncObj(aclrtGetErrorCodeFromExceptionInfo, handler); + aclrtGetMemInfo_ = DlsymAscendFuncObj(aclrtGetMemInfo, handler); + aclrtGetRunMode_ = DlsymAscendFuncObj(aclrtGetRunMode, handler); + aclrtGetStreamIdFromExceptionInfo_ = DlsymAscendFuncObj(aclrtGetStreamIdFromExceptionInfo, handler); + aclrtGetTaskIdFromExceptionInfo_ = DlsymAscendFuncObj(aclrtGetTaskIdFromExceptionInfo, handler); + aclrtGetThreadIdFromExceptionInfo_ = DlsymAscendFuncObj(aclrtGetThreadIdFromExceptionInfo, handler); + aclrtLaunchCallback_ = DlsymAscendFuncObj(aclrtLaunchCallback, handler); + aclrtMalloc_ = DlsymAscendFuncObj(aclrtMalloc, handler); + aclrtMallocHost_ = DlsymAscendFuncObj(aclrtMallocHost, handler); + aclrtMemcpy_ = DlsymAscendFuncObj(aclrtMemcpy, handler); + aclrtMemcpyAsync_ = DlsymAscendFuncObj(aclrtMemcpyAsync, handler); + aclrtMemset_ = DlsymAscendFuncObj(aclrtMemset, handler); + aclrtMemsetAsync_ = DlsymAscendFuncObj(aclrtMemsetAsync, handler); + aclrtProcessReport_ = DlsymAscendFuncObj(aclrtProcessReport, handler); + aclrtQueryEventStatus_ = DlsymAscendFuncObj(aclrtQueryEventStatus, handler); + aclrtRecordEvent_ = DlsymAscendFuncObj(aclrtRecordEvent, handler); + aclrtResetDevice_ = DlsymAscendFuncObj(aclrtResetDevice, handler); + aclrtResetEvent_ = DlsymAscendFuncObj(aclrtResetEvent, handler); + 
aclrtSetCurrentContext_ = DlsymAscendFuncObj(aclrtSetCurrentContext, handler); + aclrtSetDevice_ = DlsymAscendFuncObj(aclrtSetDevice, handler); + aclrtSetDeviceSatMode_ = DlsymAscendFuncObj(aclrtSetDeviceSatMode, handler); + aclrtSetExceptionInfoCallback_ = DlsymAscendFuncObj(aclrtSetExceptionInfoCallback, handler); + aclrtSetOpExecuteTimeOut_ = DlsymAscendFuncObj(aclrtSetOpExecuteTimeOut, handler); + aclrtSetOpWaitTimeout_ = DlsymAscendFuncObj(aclrtSetOpWaitTimeout, handler); + aclrtSetStreamFailureMode_ = DlsymAscendFuncObj(aclrtSetStreamFailureMode, handler); + aclrtStreamQuery_ = DlsymAscendFuncObj(aclrtStreamQuery, handler); + aclrtStreamWaitEvent_ = DlsymAscendFuncObj(aclrtStreamWaitEvent, handler); + aclrtSubscribeReport_ = DlsymAscendFuncObj(aclrtSubscribeReport, handler); + aclrtSynchronizeEvent_ = DlsymAscendFuncObj(aclrtSynchronizeEvent, handler); + aclrtSynchronizeStream_ = DlsymAscendFuncObj(aclrtSynchronizeStream, handler); + aclrtSynchronizeStreamWithTimeout_ = DlsymAscendFuncObj(aclrtSynchronizeStreamWithTimeout, handler); + aclrtSynchronizeDeviceWithTimeout_ = DlsymAscendFuncObj(aclrtSynchronizeDeviceWithTimeout, handler); + aclrtUnmapMem_ = DlsymAscendFuncObj(aclrtUnmapMem, handler); + aclrtReserveMemAddress_ = DlsymAscendFuncObj(aclrtReserveMemAddress, handler); + aclrtMallocPhysical_ = DlsymAscendFuncObj(aclrtMallocPhysical, handler); + aclrtMapMem_ = DlsymAscendFuncObj(aclrtMapMem, handler); + aclrtFreePhysical_ = DlsymAscendFuncObj(aclrtFreePhysical, handler); + aclrtReleaseMemAddress_ = DlsymAscendFuncObj(aclrtReleaseMemAddress, handler); + aclrtCtxSetSysParamOpt_ = DlsymAscendFuncObj(aclrtCtxSetSysParamOpt, handler); + aclrtGetMemUceInfo_ = DlsymAscendFuncObj(aclrtGetMemUceInfo, handler); + aclrtDeviceTaskAbort_ = DlsymAscendFuncObj(aclrtDeviceTaskAbort, handler); + aclrtMemUceRepair_ = DlsymAscendFuncObj(aclrtMemUceRepair, handler); + aclrtEventGetTimestamp_ = DlsymAscendFuncObj(aclrtEventGetTimestamp, handler); + aclrtDeviceGetBareTgid_ = DlsymAscendFuncObj(aclrtDeviceGetBareTgid, handler); + aclrtMemExportToShareableHandle_ = DlsymAscendFuncObj(aclrtMemExportToShareableHandle, handler); + aclrtMemSetPidToShareableHandle_ = DlsymAscendFuncObj(aclrtMemSetPidToShareableHandle, handler); + aclrtMemImportFromShareableHandle_ = DlsymAscendFuncObj(aclrtMemImportFromShareableHandle, handler); + aclrtGetLastError_ = DlsymAscendFuncObj(aclrtGetLastError, handler); + LOG_OUT << "Load acl rt api success!"; +} + +void LoadSimulationRtApi() { + ASSIGN_SIMU(aclrtCreateContext); + ASSIGN_SIMU(aclrtCreateEvent); + ASSIGN_SIMU(aclrtCreateEventWithFlag); + ASSIGN_SIMU(aclrtCreateEventExWithFlag); + ASSIGN_SIMU(aclrtCreateStreamWithConfig); + ASSIGN_SIMU(aclrtDestroyContext); + ASSIGN_SIMU(aclrtDestroyEvent); + ASSIGN_SIMU(aclrtDestroyStream); + ASSIGN_SIMU(aclrtDestroyStreamForce); + ASSIGN_SIMU(aclrtEventElapsedTime); + ASSIGN_SIMU(aclrtFree); + ASSIGN_SIMU(aclrtFreeHost); + ASSIGN_SIMU(aclrtGetCurrentContext); + ASSIGN_SIMU(aclrtGetDevice); + ASSIGN_SIMU(aclrtGetDeviceCount); + ASSIGN_SIMU(aclrtGetDeviceIdFromExceptionInfo); + ASSIGN_SIMU(aclrtGetErrorCodeFromExceptionInfo); + ASSIGN_SIMU(aclrtGetMemInfo); + ASSIGN_SIMU(aclrtGetRunMode); + ASSIGN_SIMU(aclrtGetStreamIdFromExceptionInfo); + ASSIGN_SIMU(aclrtGetTaskIdFromExceptionInfo); + ASSIGN_SIMU(aclrtGetThreadIdFromExceptionInfo); + ASSIGN_SIMU(aclrtLaunchCallback); + ASSIGN_SIMU(aclrtMalloc); + ASSIGN_SIMU(aclrtMallocHost); + ASSIGN_SIMU(aclrtMemcpy); + ASSIGN_SIMU(aclrtMemcpyAsync); + ASSIGN_SIMU(aclrtMemset); + 
ASSIGN_SIMU(aclrtMemsetAsync); + ASSIGN_SIMU(aclrtProcessReport); + ASSIGN_SIMU(aclrtQueryEventStatus); + ASSIGN_SIMU(aclrtRecordEvent); + ASSIGN_SIMU(aclrtResetDevice); + ASSIGN_SIMU(aclrtResetEvent); + ASSIGN_SIMU(aclrtSetCurrentContext); + ASSIGN_SIMU(aclrtSetDevice); + ASSIGN_SIMU(aclrtSetDeviceSatMode); + ASSIGN_SIMU(aclrtSetExceptionInfoCallback); + ASSIGN_SIMU(aclrtSetOpExecuteTimeOut); + ASSIGN_SIMU(aclrtSetOpWaitTimeout); + ASSIGN_SIMU(aclrtSetStreamFailureMode); + ASSIGN_SIMU(aclrtStreamQuery); + ASSIGN_SIMU(aclrtStreamWaitEvent); + ASSIGN_SIMU(aclrtSubscribeReport); + ASSIGN_SIMU(aclrtSynchronizeEvent); + ASSIGN_SIMU(aclrtSynchronizeStream); + ASSIGN_SIMU(aclrtSynchronizeStreamWithTimeout); + ASSIGN_SIMU(aclrtSynchronizeDeviceWithTimeout); + ASSIGN_SIMU(aclrtUnmapMem); + ASSIGN_SIMU(aclrtReserveMemAddress); + ASSIGN_SIMU(aclrtMallocPhysical); + ASSIGN_SIMU(aclrtMapMem); + ASSIGN_SIMU(aclrtFreePhysical); + ASSIGN_SIMU(aclrtReleaseMemAddress); + ASSIGN_SIMU(aclrtCtxSetSysParamOpt); + ASSIGN_SIMU(aclrtGetMemUceInfo); + ASSIGN_SIMU(aclrtDeviceTaskAbort); + ASSIGN_SIMU(aclrtMemUceRepair); + ASSIGN_SIMU(aclrtEventGetTimestamp); + ASSIGN_SIMU(aclrtDeviceGetBareTgid); + ASSIGN_SIMU(aclrtMemExportToShareableHandle); + ASSIGN_SIMU(aclrtMemSetPidToShareableHandle); + ASSIGN_SIMU(aclrtMemImportFromShareableHandle); + ASSIGN_SIMU(aclrtGetLastError); +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h new file mode 100644 index 0000000000000000000000000000000000000000..e53dae76e782fb7fc8f43b65c638e2e208ced791 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_rt_symbol.h @@ -0,0 +1,94 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_SYMBOL_H_ +#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_SYMBOL_H_ +#include +#include "acl/acl_rt.h" +#include "hardware/hardware_abstract/dlopen_macro.h" + +namespace mindspore::device::ascend { +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtCreateContext, aclError, aclrtContext *, int32_t) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtCreateEvent, aclError, aclrtEvent *) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtCreateEventWithFlag, aclError, aclrtEvent *, uint32_t) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtCreateEventExWithFlag, aclError, aclrtEvent *, uint32_t) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtCreateStreamWithConfig, aclError, aclrtStream *, uint32_t, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtDestroyContext, aclError, aclrtContext) +ORIGIN_METHOD_WITH_SIMU(aclrtDestroyEvent, aclError, aclrtEvent) +ORIGIN_METHOD_WITH_SIMU(aclrtDestroyStream, aclError, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtDestroyStreamForce, aclError, aclrtStream) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtEventElapsedTime, aclError, float *, aclrtEvent, aclrtEvent) +ORIGIN_METHOD_WITH_SIMU(aclrtFree, aclError, void *) +ORIGIN_METHOD_WITH_SIMU(aclrtFreeHost, aclError, void *) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtGetCurrentContext, aclError, aclrtContext *) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtGetDevice, aclError, int32_t *) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtGetDeviceCount, aclError, uint32_t *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetDeviceIdFromExceptionInfo, uint32_t, const aclrtExceptionInfo *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetErrorCodeFromExceptionInfo, uint32_t, const aclrtExceptionInfo *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetMemInfo, aclError, aclrtMemAttr, size_t *, size_t *) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtGetRunMode, aclError, aclrtRunMode *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetStreamIdFromExceptionInfo, uint32_t, const aclrtExceptionInfo *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetTaskIdFromExceptionInfo, uint32_t, const aclrtExceptionInfo *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetThreadIdFromExceptionInfo, uint32_t, const aclrtExceptionInfo *) +ORIGIN_METHOD_WITH_SIMU(aclrtLaunchCallback, aclError, aclrtCallback, void *, aclrtCallbackBlockType, aclrtStream) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtMalloc, aclError, void **, size_t, aclrtMemMallocPolicy) +ORIGIN_METHOD_WITH_SIMU_CREATE(aclrtMallocHost, aclError, void **, size_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMemcpy, aclError, void *, size_t, const void *, size_t, aclrtMemcpyKind) +ORIGIN_METHOD_WITH_SIMU(aclrtMemcpyAsync, aclError, void *, size_t, const void *, size_t, aclrtMemcpyKind, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtMemset, aclError, void *, size_t, int32_t, size_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMemsetAsync, aclError, void *, size_t, int32_t, size_t, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtProcessReport, aclError, int32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtQueryEventStatus, aclError, aclrtEvent, aclrtEventRecordedStatus *) +ORIGIN_METHOD_WITH_SIMU(aclrtRecordEvent, aclError, aclrtEvent, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtResetDevice, aclError, int32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtResetEvent, aclError, aclrtEvent, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtSetCurrentContext, aclError, aclrtContext) +ORIGIN_METHOD_WITH_SIMU(aclrtSetDevice, aclError, int32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtSetDeviceSatMode, aclError, aclrtFloatOverflowMode) +ORIGIN_METHOD_WITH_SIMU(aclrtSetExceptionInfoCallback, aclError, aclrtExceptionInfoCallback) +ORIGIN_METHOD_WITH_SIMU(aclrtSetOpExecuteTimeOut, aclError, uint32_t) 
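+
+// Editorial note: the exact expansion of ORIGIN_METHOD_WITH_SIMU (and its _CREATE variant) lives in
+// hardware/hardware_abstract/dlopen_macro.h and is not shown in this patch. A plausible sketch of
+// what ORIGIN_METHOD_WITH_SIMU(aclrtSetDevice, aclError, int32_t) generates is roughly:
+//
+//   using aclrtSetDeviceFunObj = std::function<aclError(int32_t)>;       // callable wrapper type
+//   extern aclrtSetDeviceFunObj aclrtSetDevice_;                         // bound by LoadAclRtApiSymbol
+//   inline aclError aclrtSetDeviceSimu(int32_t) { return ACL_SUCCESS; }  // no-op simulation stub
+//
+// so DlsymAscendFuncObj(aclrtSetDevice, handler) fills aclrtSetDevice_ from the real library, while
+// ASSIGN_SIMU(aclrtSetDevice) points it at the stub for dry runs without an Ascend device.
+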
+ORIGIN_METHOD_WITH_SIMU(aclrtSetOpWaitTimeout, aclError, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtSetStreamFailureMode, aclError, aclrtStream, uint64_t) +ORIGIN_METHOD_WITH_SIMU(aclrtStreamQuery, aclError, aclrtStream, aclrtStreamStatus *) +ORIGIN_METHOD_WITH_SIMU(aclrtStreamWaitEvent, aclError, aclrtStream, aclrtEvent) +ORIGIN_METHOD_WITH_SIMU(aclrtSubscribeReport, aclError, uint64_t, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtSynchronizeEvent, aclError, aclrtEvent) +ORIGIN_METHOD_WITH_SIMU(aclrtSynchronizeStream, aclError, aclrtStream) +ORIGIN_METHOD_WITH_SIMU(aclrtSynchronizeStreamWithTimeout, aclError, aclrtStream, int32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtSynchronizeDeviceWithTimeout, aclError, int32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtUnmapMem, aclError, void *) +ORIGIN_METHOD_WITH_SIMU(aclrtReserveMemAddress, aclError, void **, size_t, size_t, void *, uint64_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMallocPhysical, aclError, aclrtDrvMemHandle *, size_t, const aclrtPhysicalMemProp *, + uint64_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMapMem, aclError, void *, size_t, size_t, aclrtDrvMemHandle, uint64_t) +ORIGIN_METHOD_WITH_SIMU(aclrtFreePhysical, aclError, aclrtDrvMemHandle) +ORIGIN_METHOD_WITH_SIMU(aclrtReleaseMemAddress, aclError, void *) +ORIGIN_METHOD_WITH_SIMU(aclrtCtxSetSysParamOpt, aclError, aclSysParamOpt, int64_t) +ORIGIN_METHOD_WITH_SIMU(aclrtGetMemUceInfo, aclError, int32_t, aclrtMemUceInfo *, size_t, size_t *) +ORIGIN_METHOD_WITH_SIMU(aclrtDeviceTaskAbort, aclError, int32_t, uint32_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMemUceRepair, aclError, int32_t, aclrtMemUceInfo *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclrtEventGetTimestamp, aclError, aclrtEvent, uint64_t *) +ORIGIN_METHOD_WITH_SIMU(aclrtDeviceGetBareTgid, aclError, int32_t *) +ORIGIN_METHOD_WITH_SIMU(aclrtMemExportToShareableHandle, aclError, aclrtDrvMemHandle, aclrtMemHandleType, uint64_t, + uint64_t *) +ORIGIN_METHOD_WITH_SIMU(aclrtMemSetPidToShareableHandle, aclError, uint64_t, int32_t *, size_t) +ORIGIN_METHOD_WITH_SIMU(aclrtMemImportFromShareableHandle, aclError, uint64_t, int32_t, aclrtDrvMemHandle *) +ORIGIN_METHOD_WITH_SIMU(aclrtGetLastError, aclError, aclrtLastErrLevel) + +void LoadAclRtApiSymbol(const std::string &ascend_path); +void LoadSimulationRtApi(); +} // namespace mindspore::device::ascend + +#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_RT_SYMBOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.cc new file mode 100644 index 0000000000000000000000000000000000000000..706b4325a6f69dcec4cc75db5a0fb1d3b8aa2c4a --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.cc @@ -0,0 +1,41 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "acl_symbol.h"
+#include <string>
+#include "symbol_utils.h"
+
+namespace mindspore::device::ascend {
+
+aclInitFunObj aclInit_ = nullptr;
+aclFinalizeFunObj aclFinalize_ = nullptr;
+
+void LoadAclApiSymbol(const std::string &ascend_path) {
+  std::string acl_plugin_path = ascend_path + "lib64/libascendcl.so";
+  auto base_handler = GetLibHandler(acl_plugin_path);
+  if (base_handler == nullptr) {
+    LOG_OUT << "Dlopen " << acl_plugin_path << " failed! " << dlerror();
+    return;
+  }
+  aclInit_ = DlsymAscendFuncObj(aclInit, base_handler);
+  aclFinalize_ = DlsymAscendFuncObj(aclFinalize, base_handler);
+  LOG_OUT << "Load acl base api success!";
+}
+
+void LoadSimulationAclApi() {
+  ASSIGN_SIMU(aclInit);
+  ASSIGN_SIMU(aclFinalize);
+}
+}  // namespace mindspore::device::ascend
diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..a38f0bbeab8cb11bf510f2f6ac2b38ed02b90230
--- /dev/null
+++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_symbol.h
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_SYMBOL_H_
+#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_SYMBOL_H_
+#include <string>
+#include "acl/acl_rt_allocator.h"
+#include "hardware/hardware_abstract/dlopen_macro.h"
+#include "hardware/ascend/res_manager/symbol_interface/acl_base_symbol.h"
+
+namespace mindspore::device::ascend {
+
+ORIGIN_METHOD_WITH_SIMU(aclInit, aclError, const char *)
+ORIGIN_METHOD_WITH_SIMU(aclFinalize, aclError)
+
+extern aclInitFunObj aclInit_;
+extern aclFinalizeFunObj aclFinalize_;
+
+void LoadAclApiSymbol(const std::string &ascend_path);
+void LoadSimulationAclApi();
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_SYMBOL_H_
+ */
+#include "acl_tdt_symbol.h"
+#include <string>
+#include <vector>
+#include "symbol_utils.h"
+
+namespace mindspore::device::ascend {
+
+acltdtAddDataItemFunObj acltdtAddDataItem_ = nullptr;
+acltdtCleanChannelFunObj acltdtCleanChannel_ = nullptr;
+acltdtCreateChannelFunObj acltdtCreateChannel_ = nullptr;
+acltdtCreateChannelWithCapacityFunObj acltdtCreateChannelWithCapacity_ = nullptr;
+acltdtCreateDataItemFunObj acltdtCreateDataItem_ = nullptr;
+acltdtCreateDatasetFunObj acltdtCreateDataset_ = nullptr;
+acltdtDestroyChannelFunObj acltdtDestroyChannel_ = nullptr;
+acltdtDestroyDataItemFunObj acltdtDestroyDataItem_ = nullptr;
+acltdtDestroyDatasetFunObj acltdtDestroyDataset_ = nullptr;
+acltdtGetDataAddrFromItemFunObj acltdtGetDataAddrFromItem_ = nullptr;
+acltdtGetDataItemFunObj acltdtGetDataItem_ = nullptr;
+acltdtGetDatasetNameFunObj acltdtGetDatasetName_ = nullptr;
+acltdtGetDatasetSizeFunObj acltdtGetDatasetSize_ = nullptr;
+acltdtGetDataSizeFromItemFunObj acltdtGetDataSizeFromItem_ = nullptr;
+acltdtGetDataTypeFromItemFunObj acltdtGetDataTypeFromItem_ = nullptr;
+acltdtGetDimNumFromItemFunObj acltdtGetDimNumFromItem_ = nullptr;
+acltdtGetDimsFromItemFunObj acltdtGetDimsFromItem_ = nullptr;
+acltdtGetTensorTypeFromItemFunObj acltdtGetTensorTypeFromItem_ = nullptr;
+acltdtGetSliceInfoFromItemFunObj acltdtGetSliceInfoFromItem_ = nullptr;
+acltdtQueryChannelSizeFunObj acltdtQueryChannelSize_ = nullptr;
+acltdtReceiveTensorFunObj acltdtReceiveTensor_ = nullptr;
+acltdtSendTensorFunObj acltdtSendTensor_ = nullptr;
+acltdtStopChannelFunObj acltdtStopChannel_ = nullptr;
+
+void LoadAcltdtApiSymbol(const std::string &ascend_path) {
+  const std::vector<std::string> depend_libs = {"libacl_tdt_queue.so"};
+  for (const auto &dep_lib : depend_libs) {
+    (void)GetLibHandler(ascend_path + "lib64/" + dep_lib);
+  }
+
+  std::string aclrt_tdt_path = ascend_path + "lib64/libacl_tdt_channel.so";
+  auto handler = GetLibHandler(aclrt_tdt_path);
+  if (handler == nullptr) {
+    LOG_OUT << "Dlopen " << aclrt_tdt_path << " failed! "
<< dlerror(); + return; + } + acltdtAddDataItem_ = DlsymAscendFuncObj(acltdtAddDataItem, handler); + acltdtCleanChannel_ = DlsymAscendFuncObj(acltdtCleanChannel, handler); + acltdtCreateChannel_ = DlsymAscendFuncObj(acltdtCreateChannel, handler); + acltdtCreateChannelWithCapacity_ = DlsymAscendFuncObj(acltdtCreateChannelWithCapacity, handler); + acltdtCreateDataItem_ = DlsymAscendFuncObj(acltdtCreateDataItem, handler); + acltdtCreateDataset_ = DlsymAscendFuncObj(acltdtCreateDataset, handler); + acltdtDestroyChannel_ = DlsymAscendFuncObj(acltdtDestroyChannel, handler); + acltdtDestroyDataItem_ = DlsymAscendFuncObj(acltdtDestroyDataItem, handler); + acltdtDestroyDataset_ = DlsymAscendFuncObj(acltdtDestroyDataset, handler); + acltdtGetDataAddrFromItem_ = DlsymAscendFuncObj(acltdtGetDataAddrFromItem, handler); + acltdtGetDataItem_ = DlsymAscendFuncObj(acltdtGetDataItem, handler); + acltdtGetDatasetName_ = DlsymAscendFuncObj(acltdtGetDatasetName, handler); + acltdtGetDatasetSize_ = DlsymAscendFuncObj(acltdtGetDatasetSize, handler); + acltdtGetDataSizeFromItem_ = DlsymAscendFuncObj(acltdtGetDataSizeFromItem, handler); + acltdtGetDataTypeFromItem_ = DlsymAscendFuncObj(acltdtGetDataTypeFromItem, handler); + acltdtGetDimNumFromItem_ = DlsymAscendFuncObj(acltdtGetDimNumFromItem, handler); + acltdtGetDimsFromItem_ = DlsymAscendFuncObj(acltdtGetDimsFromItem, handler); + acltdtGetTensorTypeFromItem_ = DlsymAscendFuncObj(acltdtGetTensorTypeFromItem, handler); + acltdtGetSliceInfoFromItem_ = DlsymAscendFuncObj(acltdtGetSliceInfoFromItem, handler); + acltdtQueryChannelSize_ = DlsymAscendFuncObj(acltdtQueryChannelSize, handler); + acltdtReceiveTensor_ = DlsymAscendFuncObj(acltdtReceiveTensor, handler); + acltdtSendTensor_ = DlsymAscendFuncObj(acltdtSendTensor, handler); + acltdtStopChannel_ = DlsymAscendFuncObj(acltdtStopChannel, handler); + LOG_OUT << "Load acl tdt api success!"; +} + +void LoadSpecialSimulationTdtApi() { + acltdtQueryChannelSize_ = [](const acltdtChannelHandle *handle, size_t *ret_size_ptr) { + if (handle == nullptr) { + LOG_OUT << "Empty handle!"; + } + if (ret_size_ptr != nullptr) { + *ret_size_ptr = 1; + } + return ACL_SUCCESS; + }; +} + +void LoadSimulationTdtApi() { + ASSIGN_SIMU(acltdtAddDataItem); + ASSIGN_SIMU(acltdtCleanChannel); + ASSIGN_SIMU(acltdtCreateChannel); + ASSIGN_SIMU(acltdtCreateChannelWithCapacity); + ASSIGN_SIMU(acltdtCreateDataItem); + ASSIGN_SIMU(acltdtCreateDataset); + ASSIGN_SIMU(acltdtDestroyChannel); + ASSIGN_SIMU(acltdtDestroyDataItem); + ASSIGN_SIMU(acltdtDestroyDataset); + ASSIGN_SIMU(acltdtGetDataAddrFromItem); + ASSIGN_SIMU(acltdtGetDataItem); + ASSIGN_SIMU(acltdtGetDatasetName); + ASSIGN_SIMU(acltdtGetDatasetSize); + ASSIGN_SIMU(acltdtGetDataSizeFromItem); + ASSIGN_SIMU(acltdtGetDataTypeFromItem); + ASSIGN_SIMU(acltdtGetDimNumFromItem); + ASSIGN_SIMU(acltdtGetDimsFromItem); + ASSIGN_SIMU(acltdtGetTensorTypeFromItem); + ASSIGN_SIMU(acltdtGetSliceInfoFromItem); + ASSIGN_SIMU(acltdtQueryChannelSize); + ASSIGN_SIMU(acltdtReceiveTensor); + ASSIGN_SIMU(acltdtSendTensor); + ASSIGN_SIMU(acltdtStopChannel); + LoadSpecialSimulationTdtApi(); +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_tdt_symbol.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_tdt_symbol.h new file mode 100644 index 0000000000000000000000000000000000000000..1761f47a578d1f3bc50be855a373448fadd0b7f5 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/acl_tdt_symbol.h @@ -0,0 +1,54 @@ 
+/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_TDT_SYMBOL_H_ +#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_TDT_SYMBOL_H_ +#include +#include +#include "acl/acl_tdt.h" +#include "hardware/hardware_abstract/dlopen_macro.h" + +namespace mindspore::device::ascend { + +ORIGIN_METHOD_WITH_SIMU(acltdtAddDataItem, aclError, acltdtDataset *, acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtCleanChannel, aclError, acltdtChannelHandle *) +ORIGIN_METHOD_WITH_SIMU(acltdtCreateChannel, acltdtChannelHandle *, uint32_t, const char *) +ORIGIN_METHOD_WITH_SIMU(acltdtCreateChannelWithCapacity, acltdtChannelHandle *, uint32_t, const char *, size_t) +ORIGIN_METHOD_WITH_SIMU(acltdtCreateDataItem, acltdtDataItem *, acltdtTensorType, const int64_t *, size_t, aclDataType, + void *, size_t) +ORIGIN_METHOD_WITH_SIMU(acltdtCreateDataset, acltdtDataset *) +ORIGIN_METHOD_WITH_SIMU(acltdtDestroyChannel, aclError, acltdtChannelHandle *) +ORIGIN_METHOD_WITH_SIMU(acltdtDestroyDataItem, aclError, acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtDestroyDataset, aclError, acltdtDataset *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDataAddrFromItem, void *, const acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDataItem, acltdtDataItem *, const acltdtDataset *, size_t) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDatasetName, const char *, const acltdtDataset *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDatasetSize, size_t, const acltdtDataset *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDataSizeFromItem, size_t, const acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDataTypeFromItem, aclDataType, const acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDimNumFromItem, size_t, const acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetDimsFromItem, aclError, const acltdtDataItem *, int64_t *, size_t) +ORIGIN_METHOD_WITH_SIMU(acltdtGetTensorTypeFromItem, acltdtTensorType, const acltdtDataItem *) +ORIGIN_METHOD_WITH_SIMU(acltdtGetSliceInfoFromItem, aclError, const acltdtDataItem *, size_t *, size_t *) +ORIGIN_METHOD_WITH_SIMU(acltdtQueryChannelSize, aclError, const acltdtChannelHandle *, size_t *) +ORIGIN_METHOD_WITH_SIMU(acltdtReceiveTensor, aclError, const acltdtChannelHandle *, acltdtDataset *, int32_t) +ORIGIN_METHOD_WITH_SIMU(acltdtSendTensor, aclError, const acltdtChannelHandle *, const acltdtDataset *, int32_t) +ORIGIN_METHOD_WITH_SIMU(acltdtStopChannel, aclError, acltdtChannelHandle *) + +void LoadAcltdtApiSymbol(const std::string &ascend_path); +void LoadSimulationTdtApi(); +} // namespace mindspore::device::ascend + +#endif // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_ACL_TDT_SYMBOL_H_ diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.cc b/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..52b37bf43c5045bdb75a9c194cdbe6454aaee3b4 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.cc @@ 
-0,0 +1,96 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "symbol_utils.h" +#include +#include "acl_base_symbol.h" +#include "acl_compiler_symbol.h" +#include "acl_mdl_symbol.h" +#include "acl_op_symbol.h" +#include "acl_rt_allocator_symbol.h" +#include "acl_rt_symbol.h" +#include "acl_symbol.h" +#include "acl_tdt_symbol.h" + +namespace mindspore::device::ascend { + +static bool load_ascend_api = false; +static bool load_simulation_api = false; + +void *GetLibHandler(const std::string &lib_path, bool if_global) { + void *handler = nullptr; + if (if_global) { + handler = dlopen(lib_path.c_str(), RTLD_LAZY | RTLD_GLOBAL); + } else { + handler = dlopen(lib_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + } + if (handler == nullptr) { + LOG_OUT << "Dlopen " << lib_path << " failed!" << dlerror(); + } + return handler; +} + +std::string GetAscendPath() { + Dl_info info; + if (dladdr(reinterpret_cast(aclrtMalloc), &info) == 0) { + LOG_ERROR << "Get dladdr failed."; + return ""; + } + auto path_tmp = std::string(info.dli_fname); + const std::string kLatest = "latest"; + auto pos = path_tmp.rfind(kLatest); + if (pos == std::string::npos) { + LOG_ERROR << "Get ascend path failed, please check whether CANN packages are installed correctly, \n" + "and environment variables are set by source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh."; + } + return path_tmp.substr(0, pos) + kLatest + "/"; +} + +void LoadAscendApiSymbols() { + if (load_ascend_api) { + LOG_OUT << "Ascend api is already loaded."; + return; + } + std::string ascend_path = GetAscendPath(); + LoadAclBaseApiSymbol(ascend_path); + LoadAclOpCompilerApiSymbol(ascend_path); + LoadAclMdlApiSymbol(ascend_path); + LoadAclOpApiSymbol(ascend_path); + LoadAclAllocatorApiSymbol(ascend_path); + LoadAclRtApiSymbol(ascend_path); + LoadAclApiSymbol(ascend_path); + LoadAcltdtApiSymbol(ascend_path); + load_ascend_api = true; + LOG_OUT << "Load ascend api success!"; +} + +void LoadSimulationApiSymbols() { + if (load_simulation_api) { + LOG_OUT << "Simulation api is already loaded."; + return; + } + + LoadSimulationAclBaseApi(); + LoadSimulationRtApi(); + LoadSimulationTdtApi(); + LoadSimulationAclOpCompilerApi(); + LoadSimulationAclMdlApi(); + LoadSimulationAclOpApi(); + LoadSimulationAclAllocatorApi(); + LoadSimulationAclApi(); + load_simulation_api = true; + LOG_OUT << "Load simulation api success!"; +} +} // namespace mindspore::device::ascend diff --git a/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.h b/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8d6538aa1e9ea2130f5046b524be9ed4402f9221 --- /dev/null +++ b/inferrt/src/hardware/ascend/res_manager/symbol_interface/symbol_utils.h @@ -0,0 +1,85 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_TRANSFORM_SYMBOL_SYMBOL_UTILS_H_
+#define MINDSPORE_CCSRC_TRANSFORM_SYMBOL_SYMBOL_UTILS_H_
+#include <string>
+#include <type_traits>
+#include "common/common.h"
+#include "acl/acl.h"
+#include "hardware/hardware_abstract/visible.h"
+
+extern "C" HARDWARE_EXPORT int (*aclrt_get_last_error)(int);
+
+#ifndef ACL_ERROR_RT_DEVICE_MEM_ERROR
+#define ACL_ERROR_RT_DEVICE_MEM_ERROR 507053
+#endif
+#ifndef ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR
+#define ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR 507054
+#endif
+#ifndef ACL_ERROR_RT_COMM_OP_RETRY_FAIL
+#define ACL_ERROR_RT_COMM_OP_RETRY_FAIL 507904
+#endif
+#ifndef ACL_ERROR_RT_DEVICE_TASK_ABORT
+#define ACL_ERROR_RT_DEVICE_TASK_ABORT 107022
+#endif
+const int thread_level = 0;
+
+template <typename Function, typename... Args>
+auto RunAscendApi(Function f, int line, const char *call_f, const char *func_name, Args... args) {
+  if (f == nullptr) {
+    LOG_ERROR << func_name << " is null.";
+  }
+
+  if constexpr (std::is_same_v<std::invoke_result_t<Function, Args...>, int>) {
+    auto ret = f(args...);
+    return ret;
+  } else {
+    return f(args...);
+  }
+}
+
+template <typename Function>
+auto RunAscendApi(Function f, int line, const char *call_f, const char *func_name) {
+  if (f == nullptr) {
+    LOG_ERROR << func_name << " is null.";
+  }
+  if constexpr (std::is_same_v<std::invoke_result_t<Function>, int>) {
+    auto ret = f();
+    return ret;
+  } else {
+    return f();
+  }
+}
+
+template <typename Function>
+bool HasAscendApi(Function f) {
+  return f != nullptr;
+}
+
+namespace mindspore::device::ascend {
+
+#define CALL_ASCEND_API(func_name, ...) \
+  RunAscendApi(mindspore::device::ascend::func_name##_, __LINE__, __FUNCTION__, #func_name, ##__VA_ARGS__)
+
+#define HAS_ASCEND_API(func_name) HasAscendApi(mindspore::device::ascend::func_name##_)
+
+std::string GetAscendPath();
+void *GetLibHandler(const std::string &lib_path, bool if_global = false);
+void LoadAscendApiSymbols();
+void LoadSimulationApiSymbols();
+}  // namespace mindspore::device::ascend
+
+#endif  // MINDSPORE_CCSRC_TRANSFORM_SYMBOL_SYMBOL_UTILS_H_
diff --git a/inferrt/src/hardware/cpu/CMakeLists.txt b/inferrt/src/hardware/cpu/CMakeLists.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..985e282b325ed23f7f5748c170443dd7040a18bc 100644
--- a/inferrt/src/hardware/cpu/CMakeLists.txt
+++ b/inferrt/src/hardware/cpu/CMakeLists.txt
@@ -0,0 +1,4 @@
+# check_debug_log_out()
+
+# file(GLOB_RECURSE HARDWARE_CPU_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
+# add_library(hardware_cpu_obj STATIC ${HARDWARE_CPU_SRC_FILES})
\ No newline at end of file
diff --git a/inferrt/src/hardware/cpu/cpu_device_context.cc b/inferrt/src/hardware/cpu/cpu_device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd1aa5dc7c0387a9023ac67d260138d4d7dcae50
--- /dev/null
+++ b/inferrt/src/hardware/cpu/cpu_device_context.cc
@@ -0,0 +1,641 @@
+/**
+ * Copyright 2021-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/cpu/cpu_device_context.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+namespace {
+const char kModelNameCPU[] = "CPU";
+const char kEventOptimizeGraph[] = "OptimizeGraph";
+const char kStageSetKernelInfo[] = "SetKernelInfo";
+
+std::pair<bool, size_t> MatchMultiDynamicKernelAttr(const kernel::KernelAttr &kernel_attr,
+                                                    const std::vector<int64_t> &dyn_input_sizes,
+                                                    const std::vector<kernel::KernelAttr> &kernel_attr_list) {
+  auto output_num = kernel_attr.GetOutputSize();
+  for (size_t index = 0; index < kernel_attr_list.size(); ++index) {
+    // Support multi dynamic inputs.
+    const auto &cur_kernel_attr = kernel_attr_list[index];
+    auto cur_input_num = cur_kernel_attr.GetInputSize();
+    if (dyn_input_sizes.size() != cur_input_num) {
+      MS_LOG(EXCEPTION) << "Kernel attr's input num: " << cur_input_num
+                        << ", is not equal to dynamic input size: " << dyn_input_sizes.size();
+    }
+    bool mis_match = false;
+    size_t input_index = 0;
+    for (size_t i = 0; i < cur_input_num; ++i) {
+      int64_t dyn_input_size = dyn_input_sizes[i];
+      if (dyn_input_size < 0) {
+        dyn_input_size = 1;
+      }
+      auto dtype = cur_kernel_attr.GetInputAttr(i).dtype;
+      for (size_t j = 0; j < LongToSize(dyn_input_size); ++j) {
+        if (kernel_attr.GetInputAttr(input_index).dtype != dtype) {
+          mis_match = true;
+          break;
+        }
+        ++input_index;
+      }
+      if (mis_match) {
+        break;
+      }
+    }
+    if (mis_match) {
+      continue;
+    }
+
+    // Only support one dynamic output. TODO: support multi dynamic output.
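+    // (Illustrative example, not from the original sources:) a candidate attr declaring inputs
+    // {f32, f32} with dyn_input_sizes = {3, -1} is matched above against three unrolled tensors
+    // for slot 0 and a single tensor for slot 1; the loop below then checks outputs the same way,
+    // comparing the node's inferred output slots against the candidate's output attrs.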
+    for (size_t i = 0; i < output_num; ++i) {
+      auto dtype = cur_kernel_attr.GetOutputAttr(i).dtype;
+      if (kernel_attr.GetOutputAttr(i).dtype != dtype) {
+        mis_match = true;
+        break;
+      }
+    }
+    if (!mis_match) {
+      return std::make_pair(true, index);
+    }
+  }
+  return std::make_pair(false, 0);
+}
+
+runtime::KernelTaskPtr GetTaskByTaskType(const runtime::KernelTaskType &task_type,
+                                         const std::shared_ptr &task_context) {
+  switch (task_type) {
+    case runtime::KernelTaskType::kCONTIGUOUS_TASK:
+      return std::make_shared(task_context);
+    case runtime::KernelTaskType::kCOPY_TASK:
+      return std::make_shared(task_context);
+    default:
+      MS_LOG(EXCEPTION) << "KernelTaskType is invalid, task_type:" << task_type;
+  }
+}
+
+void MallocMemoryForDeviceAddress(device::DeviceAddress *device_address, const device::DeviceContext *device_context) {
+  MS_EXCEPTION_IF_NULL(device_address);
+  device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddTask, "Graph", "Contiguous", "");
+  device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddMemInfo, "Graph", device::tracker::MemType::kPyNativeOutput,
+                                                 device_address->GetSize(), device_address);
+  if (device_address->GetPtr() == nullptr) {
+    if (!device_context->device_res_manager_->AllocateMemory(device_address)) {
+      MS_LOG(EXCEPTION) << "Allocate device memory failed!";
+    }
+  }
+}
+
+}  // namespace
+
+void SetCpuRefMapToKernelInfo(const CNodePtr &apply_kernel, const std::vector<kernel::KernelAttr> &apply_kernel_attrs) {
+  MS_EXCEPTION_IF_NULL(apply_kernel);
+  auto kernel_attrs = apply_kernel_attrs;
+  if (kernel_attrs.empty()) {
+    return;
+  }
+
+  auto build_info = AnfAlgo::GetSelectKernelBuildInfo(apply_kernel);
+  MS_EXCEPTION_IF_NULL(build_info);
+  auto kernel_attr = GetKernelAttrFromBuildInfo(build_info);
+  std::vector<int64_t> dyn_input_sizes = {};
+  if (common::AnfAlgo::HasNodeAttr(kAttrDynInputSizes, apply_kernel)) {
+    dyn_input_sizes = common::AnfAlgo::GetNodeAttr<std::vector<int64_t>>(apply_kernel, kAttrDynInputSizes);
+  }
+  std::pair<bool, size_t> match_result;
+
+  if (kernel_attrs[0].GetSkipCheck()) {
+    // If kernel skips attr check, we need to synchronize the ref map in case it's discarded.
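+    // (Editorial note:) the ref map records output->input aliasing for in-place kernels; e.g. a
+    // kernel that updates input 0 in place publishes {0 -> 0}, and dropping it would make the
+    // runtime hand the op a fresh output buffer instead of aliasing the input.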
+ SyncOutInRef(kernel_attrs[0], &kernel_attr); + kernel_attrs[0] = kernel_attr; + match_result = {true, 0}; + } else if (dyn_input_sizes.empty() || kernel_attrs[0].GetAllSame()) { + match_result = MatchKernelAttr(kernel_attr, kernel_attrs); + } else { + match_result = MatchMultiDynamicKernelAttr(kernel_attr, dyn_input_sizes, kernel_attrs); + } + + auto [is_match, index] = match_result; + if (!is_match) { + MS_LOG_WITH_NODE(EXCEPTION, apply_kernel) + << apply_kernel->fullname_with_scope() << " does not support this kernel data type: " << build_info->ToString() + << ", node debug name: " << apply_kernel->DebugString(AnfNode::DebugStringLevel::kLevel2); + } + + auto kernel_info = dynamic_cast(apply_kernel->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + const auto &matched_kernel_attr = kernel_attrs[index]; + if (!matched_kernel_attr.GetOutInRefMap().empty() || matched_kernel_attr.GetAllOutInRef()) { + kernel_info->set_ref_map(matched_kernel_attr.GetAllOutInRef(), matched_kernel_attr.GetOutInRefMap()); + } +} + +using mindspore::kernel::KernelBuildInfo; + +void CPUDeviceContext::Initialize() { +#ifdef __APPLE__ + std::lock_guard spin_lock(init_lock_); +#else + std::lock_guard lock(init_mutex_); +#endif + if (initialized_) { + return; + } + MS_EXCEPTION_IF_NULL(device_res_manager_); + device_res_manager_->Initialize(); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + if (ms_context->get_param(MS_CTX_DEVICE_TARGET) == kCPUDevice) { + // Dump json config file if dump is enabled. + uint32_t rank_id = 0; + auto &json_parser = DumpJsonParser::GetInstance(); + json_parser.Parse(); + json_parser.CopyDumpJsonToDir(rank_id); + json_parser.CopyMSCfgJsonToDir(rank_id); + } +#ifdef __linux__ + if (ms_context->IsDefaultDeviceTarget() && ms_context->get_param(MS_CTX_DEVICE_TARGET) == kCPUDevice) { + MS_LOG(INFO) + << "No device_target set, set CPU as default. You can call mindspore.set_context(device_target=\"XXX\")"; + } +#endif // __linux__ + initialized_ = true; +} + +void CPUDeviceContext::Destroy() { + MS_EXCEPTION_IF_NULL(device_res_manager_); + device_res_manager_->Destroy(); + initialized_ = false; +} + +void CPUKernelExecutor::OptimizeGraph(const FuncGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_graph = graph->cast(); + MS_EXCEPTION_IF_NULL(kernel_graph); + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + auto enable_lazy_inline = ms_context->CellReuseLevel() != CellReuseLevel::kNoCellReuse; + if (enable_lazy_inline) { + MS_LOG(EXCEPTION) << "CPU does not support the lazy_inline feature, " + << "please do not mark @lazy_inline in cell's __init__ func."; + } + if (kernel_graph->is_from_single_op()) { + SetOperatorInfo(kernel_graph); + SingleOpGraphOptimize(kernel_graph); + UpdateKernelRefInfo(kernel_graph); + } else { + // The passes in this function must be before ops select: SetOperatorInfo() + OptimizeMindIR(kernel_graph); + // Update Graph Dynamic Shape Attr. + opt::AddDynamicShapeAttrPass(kernel_graph); + + SetOperatorInfo(kernel_graph); + // SetOperatorInfo may generate new node, so need set kernel object type again. + kernel_graph->SetKernelObjectTypesForUnrealNodes(); +#ifdef ENABLE_DUMP_IR + if (ms_context->CanDump(kIntroductory)) { + DumpIR("hwopt_comm_after_kernel_select_" + graph->ToString() + ".ir", graph, true); + } +#endif + + OptimizeGraphImpl(kernel_graph); + + // Run final optimization. 
+ opt::CommonFinalOptimization(kernel_graph); + + // Run graph kernel fusion optimization + if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { + graphkernel::GraphKernelOptimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); + } + } +} + +void CPUKernelExecutor::UpdateKernelRefInfo(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + const std::vector &kernels = graph->execution_order(); + for (const auto &kernel : kernels) { + MS_EXCEPTION_IF_NULL(kernel); + const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel); + if (IsPrimitiveCNode(kernel, prim::kPrimCustom) && + mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kImplyCPU) == nullptr) { + MS_LOG(DEBUG) << "Not find operator information for Custom operator [" << op_name << "]"; + return; + } + + auto kernel_attr_list = kernel::NativeCpuKernelMod::GetCpuSupportedList(op_name); + if (kernel_attr_list.empty()) { + MS_LOG(DEBUG) << "kernel_attr_list is empty"; + return; + } + + auto kernel_info = dynamic_cast(kernel->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + kernel_info->set_ref_map(kernel_attr_list[0].GetAllOutInRef(), kernel_attr_list[0].GetOutInRefMap()); + } +} + +void CPUKernelExecutor::OptimizeMindIR(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto optimizer = std::make_shared(); + auto pm = std::make_shared(); + pm->AddPass(std::make_shared("softmax_grad_fusion_cpu")); + // Match MatMul+BiasAdd+ReLU first, if no match, then match MatMul+BiasAdd + pm->AddPass(std::make_shared("matmul_biasadd_relu_fusion_cpu")); + pm->AddPass(std::make_shared()); + + // Do communication op fusion before InsertTensorMoveForCommunication pass. + // So these passes are before kernel select process, no need to generate kernel build info in them. 
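+  // (Illustrative:) fusing first means consecutive AllReduce ops can be merged into one fused op
+  // before TensorMove copies are inserted around the individual communication nodes.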
+ if (parallel::ParallelContext::GetInstance()->enable_all_reduce_fusion()) { + MS_LOG(INFO) << "Parallel comm_fusion of AllReduce is enabled."; + pm->AddPass(std::make_shared()); + } + + optimizer->AddPassManager(pm); + (void)optimizer->Optimize(graph); + graph->SetExecOrderByDefault(); +} + +void CPUKernelExecutor::OptimizeGraphImpl(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto optimizer = std::make_shared(); + auto pm = std::make_shared(); + pm->AddPass(std::make_shared("insert_type_transform_op")); + pm->AddPass(std::make_shared("flatten_value_sequence_in_pyexecute")); + pm->AddPass(std::make_shared("insert_format_transform_op_cpu")); + pm->AddPass(std::make_shared("insert_cast")); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared("print_value_type")); + pm->AddPass(std::make_shared("insert_cast_for_pyexecute")); + optimizer->AddPassManager(pm); + (void)optimizer->Optimize(graph); + graph->SetExecOrderByDefault(); +} + +void CPUKernelExecutor::SingleOpGraphOptimize(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto optimizer = std::make_shared(); + auto pm = std::make_shared(); + pm->AddPass(std::make_shared("insert_cast")); + optimizer->AddPassManager(pm); + (void)optimizer->Optimize(graph); + graph->SetExecOrderByDefault(); +} + +namespace { +void SetControlOpInfo(const CNodePtr &kernel_node) { + MS_EXCEPTION_IF_NULL(kernel_node); + std::vector inputs_format; + std::vector inputs_type; + size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node); + for (size_t input_index = 0; input_index < input_num; ++input_index) { + (void)inputs_format.emplace_back(kOpFormat_DEFAULT); + inputs_type.push_back(common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index)); + } + std::vector outputs_format; + std::vector outputs_type; + size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); + for (size_t output_index = 0; output_index < output_num; ++output_index) { + (void)outputs_format.emplace_back(kOpFormat_DEFAULT); + outputs_type.push_back(common::AnfAlgo::GetOutputInferDataType(kernel_node, output_index)); + } + + auto builder = std::make_shared(); + builder->SetInputsFormat(inputs_format); + builder->SetInputsDeviceType(inputs_type); + builder->SetOutputsFormat(outputs_format); + builder->SetOutputsDeviceType(outputs_type); + + AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get()); +} + +// Before creating the kernel, check whether the node has completed the operator selection. If not, the operator +// selection needs to be performed to set kernel info. +void SetKernelInfoBeforeCreateKernel(const std::vector &nodes) { + // Check whether the node has completed operator selection. + for (const auto &node : nodes) { + if (AnfAlgo::GetSelectKernelBuildInfo(node) != nullptr) { + continue; + } + + // Kernel selection process for non control op. + if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) { + auto [msg, etype] = SetKernelInfoWithMsg(node); + if (!msg.empty()) { + MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg; + } + } else { + // Kernel selection process for control op. 
+ SetControlOpInfo(node); + } + } +} +} // namespace + +void CPUKernelExecutor::SetOperatorInfo(const KernelGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + uint64_t start_time = profiler::GetClockSyscnt(); + bool do_expand = false; + auto mng = graph->manager(); + if (mng == nullptr) { + mng = Manage(graph, true); + MS_EXCEPTION_IF_NULL(mng); + graph->set_manager(mng); + } + auto &node_list = graph->execution_order(); + for (auto &node : node_list) { + if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) { + auto [msg, etype] = SetKernelInfoWithMsg(node); + if (msg.empty()) { + continue; + } + auto f = [](const CNodePtr &n) { + auto res = SetKernelInfoWithMsg(n); + return res.first.empty(); + }; + auto expand_ret = expander::TryExpandCNode(node, f); + if (!expand_ret) { + MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg + << "\nnode: " << node->DebugString(AnfNode::DebugStringLevel::kLevel2); + } + MS_LOG(INFO) << msg << " but expand success."; + do_expand = true; + } else { + SetControlOpInfo(node); + } + } + if (do_expand) { + (void)opt::BindValueToGraph().Run(graph); + graph->SetExecOrderByDefault(); + } + (void)profiler::CollectHostInfo(kModelNameCPU, kEventOptimizeGraph, kStageSetKernelInfo, start_time, + profiler::GetClockSyscnt(), 1); +} + +kernel::KernelModPtr CPUKernelExecutor::CreateKernelMod(const std::string &op_name) const { + if (kernel::IsOpPluginKernel(op_name)) { + return kernel::Factory::Instance().Create(op_name); + } + return kernel::Factory::Instance().Create(op_name); +} + +void CPUKernelExecutor::CreateKernel(const std::vector &nodes) const { + SetKernelInfoBeforeCreateKernel(nodes); + + kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance(); + std::vector akg_nodes; + for (const auto &node : nodes) { + MS_EXCEPTION_IF_NULL(node); + if (common::AnfAlgo::IsBpropCutOpExecInBackend(node)) { + continue; + } + if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) { + if (!bin_map->initialized()) { + bin_map->Initialize(); + } + akg_nodes.push_back(node); + continue; + } + std::string kernel_name = common::AnfAlgo::GetCNodeName(node); + + std::shared_ptr cpu_kernel = + std::dynamic_pointer_cast(CreateKernelMod(kernel_name)); + + if (cpu_kernel == nullptr) { + MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#Build cpu operator[" << node->fullname_with_scope() + << "] failed"; + } + + auto kernel_attrs = cpu_kernel->GetOpSupport(); + SetCpuRefMapToKernelInfo(node, kernel_attrs); + auto thread_pool = kernel::GetActorMgrInnerThreadPool(); + cpu_kernel->SetThreadPool(thread_pool); + std::vector input_kernel_tensors = AnfAlgo::GetOrCreateAllInputKernelTensors(node); + std::vector output_kernel_tensors = AnfAlgo::GetOrCreateAllOutputKernelTensors(node); + auto ret = cpu_kernel->Init(common::AnfAlgo::GetCNodePrimitive(node), input_kernel_tensors, output_kernel_tensors); + if (!ret) { + MS_LOG_WITH_NODE(EXCEPTION, node) << trace::DumpSourceLines(node); + } + if (kernel::CheckResizeCondition(node)) { + if (cpu_kernel->Resize(input_kernel_tensors, output_kernel_tensors) == kernel::KRET_RESIZE_FAILED) { + MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#CPU kernel op [" << node->fullname_with_scope() + << "] resize failed."; + } + } + + AnfAlgo::SetKernelMod(cpu_kernel, node.get()); + } +#ifdef ENABLE_AKG + kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder; + (void)akg_cpu_kernel_builder.SingleOpParallelBuild(akg_nodes); +#endif +} + +std::vector 
CPUKernelExecutor::GetLaunchIgnoredInputAddressIdx(const AnfNodePtr &node) const { + MS_EXCEPTION_IF_NULL(node); + auto kernel_info = dynamic_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto kernel_mod = kernel_info->MutableKernelMod(); + MS_EXCEPTION_IF_NULL(kernel_mod); + return kernel_mod->GetLaunchIgnoredInputAddressIdx(); +} + +bool CPUKernelExecutor::IsLaunchIgnoredInputAddressIdx(const AnfNodePtr &node, size_t input_idx) const { + auto ignored_input_list = GetLaunchIgnoredInputAddressIdx(node); + if (std::find(ignored_input_list.begin(), ignored_input_list.end(), input_idx) != ignored_input_list.end()) { + return true; + } + return false; +} + +void CPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const { + MS_EXCEPTION_IF_NULL(graph); + auto kernel_graph = graph->cast(); + MS_EXCEPTION_IF_NULL(kernel_graph); + if (!kernel_graph->is_from_single_op()) { + // Remove reorder after PS feature finish adapting push/pull in auto_monad. + auto execution_order = kernel_graph->execution_order(); + common::AnfAlgo::ReorderPosteriorExecList(NOT_NULL(&execution_order)); + kernel_graph->set_execution_order(execution_order); + } + auto ms_context = MsContext::GetInstance(); + MS_EXCEPTION_IF_NULL(ms_context); + // somas + if (runtime::RuntimeConf::GetInstance()->mem_optimize_level() != kOptimizeO0) { + auto somas = std::make_shared(); + bool ret = somas->Assign(kernel_graph); + if (ret) { + MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id() + << " somas size: " << kernel_graph->somas_whole_block_size(); + } else if (somas->IsSupportSomas(*kernel_graph)) { + MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id(); + } + } + MS_LOG(INFO) << "Status record: end preprocess before run graph. 
graph id: " << kernel_graph->graph_id(); +} + +bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, KernelMod *kernel_mod, + void * /* stream */) const { + MS_EXCEPTION_IF_NULL(kernel); + + const auto &profiler_inst = profiler::cpu::CPUProfiler::GetInstance(); + MS_EXCEPTION_IF_NULL(profiler_inst); + if (profiler_inst->GetEnableFlag() && profiler_inst->GetOpTimeFlag()) { + auto ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs, kernel_mod); + return ret; + } + auto ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod); + return ret; +} + +bool CPUKernelExecutor::ExecuteKernelTask(const runtime::KernelTaskType &task_type, + const device::DeviceAddressPtrList &input_addr_list, + const device::DeviceAddressPtrList &output_addr_list, + const size_t &stream_id) const { + auto task_context = + std::make_shared(device_context_, input_addr_list, output_addr_list, nullptr); + auto task = GetTaskByTaskType(task_type, task_context); + MS_EXCEPTION_IF_NULL(task); + + auto ret = task->RunWithRet(); + if (!ret) { + MS_LOG(EXCEPTION) << "Exec task failed, task_type:" << task_type; + } + return ret; +} + +bool CPUKernelExecutor::ExecuteKernelTask(const runtime::KernelTaskType &task_type, + const std::vector &input_addr_list, + const std::vector &output_addr_list, + const size_t &stream_id) const { + if (task_type != runtime::KernelTaskType::kCONTIGUOUS_TASK) { + MS_LOG(EXCEPTION) << "KernelTaskType not supported, task_type:" << task_type; + } + MS_LOG(DEBUG) << "Start Contiguous task"; + + const auto &input_address = input_addr_list[0]; + const auto &output_address = output_addr_list[0]; + const auto &input_storage_info = input_address->GetTensorStorageInfo(); + MS_LOG(DEBUG) << "Input_storage_info:" << (input_storage_info == nullptr ? "" : input_storage_info->ToString()) + << ", input_address size:" << input_address->GetSize() + << ", output_address size:" << output_address->GetSize(); + + MallocMemoryForDeviceAddress(input_address, device_context_); + MallocMemoryForDeviceAddress(output_address, device_context_); + + kernel::ContiguousCpuKernel contiguous_kernel; + auto ret = contiguous_kernel.LaunchContiguous(input_address->type_id(), input_address, input_storage_info, + output_address->type_id(), output_address); + if (!ret) { + MS_LOG(EXCEPTION) << "CpuContiguous failed"; + } + + MS_LOG(DEBUG) << "End Contiguous task"; + return true; +} + +bool CPUKernelExecutor::LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, + KernelMod *kernel_mod) const { + MS_EXCEPTION_IF_NULL(kernel); + + auto profiler_inst = profiler::cpu::CPUProfiler::GetInstance(); + MS_EXCEPTION_IF_NULL(profiler_inst); + + uint32_t pid = IntToUint(getpid()); + // cpu support multi-thread with mindrt for profiling. 
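+  // (Editorial note:) the Begin/End producer calls below bracket DoLaunchKernel so each actor
+  // thread records its own interval for this op, keyed by fullname_with_scope() and the pid.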
+ profiler_inst->OpDataProducerBeginParallel(kernel->fullname_with_scope(), pid); + bool ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod); + profiler_inst->OpDataProducerEndParallel(kernel->fullname_with_scope()); + profiler_inst->RecordFrameWorkInfo(kernel); + return ret; +} + +bool CPUKernelExecutor::DoLaunchKernel(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, KernelMod *kernel_mod) const { + MS_EXCEPTION_IF_NULL(kernel); + MS_EXCEPTION_IF_NULL(kernel_mod); + uint64_t start_time = 0; + PROFILER_START(start_time); + auto ret = kernel_mod->Launch(inputs, workspace, outputs, nullptr); + PROFILER_END(start_time, runtime::ProfilerModule::kKernel, runtime::ProfilerEvent::kKernelLaunch, + kernel->fullname_with_scope(), false); + return ret; +} + +void CPUKernelExecutor::RebuildKernelSelectBackoffOp(const std::vector &nodes) const { + for (auto &node : nodes) { + MS_EXCEPTION_IF_NULL(node); + if (!AnfAlgo::IsKernelSelectBackoffOp(node)) { + continue; + } + auto [failure_info, failure_type] = AnfAlgo::GetKernelSelectBackoffInfo(node); + if (IsVmapNotSupported(node)) { + MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info; + } + + // Judge whether match strictly between kernel build info and supported kernel attrs. + const auto &kernel_build_info = AnfAlgo::GetSelectKernelBuildInfo(node); + MS_EXCEPTION_IF_NULL(kernel_build_info); + const auto &kernel_attr = kernel::GetKernelAttrFromBuildInfo(kernel_build_info); + const auto &supported_kernel_attrs = + kernel::NativeCpuKernelMod::GetCpuSupportedList(common::AnfAlgo::GetCNodeName(node)); + const auto &match_result = kernel::MatchKernelAttrStrict(kernel_attr, supported_kernel_attrs); + auto attr_info = kernel::FetchPrintInfoByKernelAttr(kernel_attr); + if (!match_result.first) { + MS_LOG(INFO) << "Backoff and rebuild kernel on CPU failed for node: " << node->fullname_with_scope() + << ", node attr: " << attr_info; + MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info; + } else { + // Set the CPU flag. + common::AnfAlgo::SetNodeAttr(kAttrPrimitiveTarget, MakeValue(kCPUDevice), node); + kernel_build_info->set_kernel_type(CPU_KERNEL); + kernel_build_info->set_processor(kernel::Processor::CPU); + MS_LOG(INFO) << "Backoff and rebuild kernel on CPU successfully for node: " << node->fullname_with_scope() + << ", node attr: " << attr_info; + } + + CreateKernel({node}); + } +} + +MS_REGISTER_DEVICE(kCPUDevice, CPUDeviceContext); +#ifdef WITH_BACKEND +MSCONTEXT_REGISTER_INIT_FUNC(kCPUDevice, [](MsContext *ctx) -> void { + MS_EXCEPTION_IF_NULL(ctx); + if (ctx->backend_policy() != "ms") { + (void)ctx->set_backend_policy("ms"); + } +}); +#endif + +// Register functions to _c_expression so python hal module could call CPU device interfaces. 
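+// The CPU backend currently has no stateless device functions to expose, so the binder below only
+// validates the module pointer. A backend that did expose them would add pybind defs here, e.g.
+// (hypothetical) m->def("device_count", ...).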
+void PybindCPUStatelessFunc(py::module *m) { MS_EXCEPTION_IF_NULL(m); }
+REGISTER_DEV_STATELESS_FUNC_CB(kCPUDevice, PybindCPUStatelessFunc);
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/cpu/cpu_device_context.h b/inferrt/src/hardware/cpu/cpu_device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b02af05b46bb41b182a24d0c4d16d349624f363
--- /dev/null
+++ b/inferrt/src/hardware/cpu/cpu_device_context.h
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_CPU_CPU_DEVICE_CONTEXT_H_
+#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_CPU_CPU_DEVICE_CONTEXT_H_
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/memory_manager.h"
+#include "hardware/cpu/res_manager/cpu_res_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+class CPUKernelExecutor : public KernelExecutor {
+ public:
+  CPUKernelExecutor() = default;
+  ~CPUKernelExecutor() override = default;
+
+  void OptimizeGraph(const FuncGraphPtr &graph) const override;
+
+  void CreateKernel(const std::vector<CNodePtr> &nodes) const override;
+  kernel::KernelModPtr CreateKernelMod(const std::string &op_name) const override;
+
+  // Kernel that is not supported by other device can be backed off and rebuilt on the CPU.
+  // The function will set kernel info and create kernel mod.
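+  // (Illustrative:) an op selected for another backend but lacking a kernel there is retagged with
+  // kAttrPrimitiveTarget = "CPU", its build info switched to CPU_KERNEL, and CreateKernel({node})
+  // is run again; see RebuildKernelSelectBackoffOp in cpu_device_context.cc above.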
+ void RebuildKernelSelectBackoffOp(const std::vector &nodes) const; + + void PreprocessBeforeRun(const FuncGraphPtr &graph) const override; + + bool LaunchKernel(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, const std::vector &outputs, + KernelMod *kernel_mod, void * /* stream */) const override; + bool LaunchKernelHP(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, const std::vector &outputs, + KernelMod *kernel_mod, void *stream) const override { + return LaunchKernel(kernel, inputs, workspace, outputs, kernel_mod, stream); + } + + bool ExecuteKernelTask(const runtime::KernelTaskType &task_type, const device::DeviceAddressPtrList &input_addr_list, + const device::DeviceAddressPtrList &output_addr_list, const size_t &stream_id) const override; + bool ExecuteKernelTask(const runtime::KernelTaskType &task_type, + const std::vector &input_addr_list, + const std::vector &output_addr_list, + const size_t &stream_id) const override; + + std::vector GetLaunchIgnoredInputAddressIdx(const AnfNodePtr &node) const override; + + bool IsLaunchIgnoredInputAddressIdx(const AnfNodePtr &node, size_t input_idx) const override; + + private: + // Select the matching backend kernels according to the data type and format of input and output for all + // execution operators, and set final device data type and format information for backend kernels, device + // data type and format which replace original data type and format will use for executing kernels. + void SetOperatorInfo(const KernelGraphPtr &graph) const; + void SingleOpGraphOptimize(const KernelGraphPtr &graph) const; + void OptimizeGraphImpl(const KernelGraphPtr &graph) const; + void OptimizeMindIR(const KernelGraphPtr &graph) const; + // Launch a kernel and record the elapsed time end to end. + bool LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, + const std::vector &outputs, KernelMod *kernel_mod) const; + // Launch a kernel by 'KernelMod' of the kernel. + bool DoLaunchKernel(const CNodePtr &kernel, const std::vector &inputs, + const std::vector &workspace, const std::vector &outputs, + KernelMod *kernel_mod) const; + void UpdateKernelRefInfo(const KernelGraphPtr &graph) const; + + mutable std::mutex launch_mutex_; +}; + +class CPUDeviceContext : public DeviceInterface { + public: + explicit CPUDeviceContext(const DeviceContextKey &device_context_key) : DeviceInterface(device_context_key) {} + ~CPUDeviceContext() override = default; + + void Initialize() override; + + void Destroy() override; + + private: + DISABLE_COPY_AND_ASSIGN(CPUDeviceContext); +}; +} // namespace cpu +} // namespace device +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_CPU_CPU_DEVICE_CONTEXT_H_ diff --git a/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.cc b/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..985b55ac5ddddd1b4a827c1f0128b2bbff8f327b --- /dev/null +++ b/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.cc @@ -0,0 +1,130 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hardware/cpu/res_manager/cpu_res_manager.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "hardware/hardware_abstract/memory_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+void CPUResManager::Initialize() {
+  mem_manager_ = std::make_shared<CPUMemoryManager>();
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+}
+
+void CPUResManager::Destroy() {
+  // Release memory.
+  if (mem_manager_ != nullptr) {
+    mem_manager_->Finalize();
+    mem_manager_ = nullptr;
+  }
+}
+
+void *CPUResManager::AllocateMemory(size_t size, uint32_t stream_id) const {
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  return mem_manager_->MallocMemFromMemPool(size, false, false, stream_id);
+}
+
+void CPUResManager::FreeMemory(void *ptr) const {
+  MS_EXCEPTION_IF_NULL(ptr);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->FreeMemFromMemPool(ptr);
+}
+
+void CPUResManager::FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
+                                    const std::vector<size_t> &keep_addr_sizes) const {
+  CPUMemoryPool::GetInstance().FreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes);
+}
+
+std::vector<void *> CPUResManager::AllocateContinuousMemory(const std::vector<size_t> &size_list,
+                                                            uint32_t stream_id) const {
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  return mem_manager_->MallocContinuousMemFromMemPool(size_list, stream_id);
+}
+
+namespace {
+
+// clang-format off
+#define FOR_EACH_TYPE_BASE(M)                      \
+  M(kNumberTypeBool, bool)                         \
+  M(kNumberTypeUInt8, uint8_t)                     \
+  M(kNumberTypeInt4, int8_t)                       \
+  M(kNumberTypeInt8, int8_t)                       \
+  M(kNumberTypeInt16, int16_t)                     \
+  M(kNumberTypeInt32, int32_t)                     \
+  M(kNumberTypeInt64, int64_t)                     \
+  M(kNumberTypeUInt16, uint16_t)                   \
+  M(kNumberTypeUInt32, uint32_t)                   \
+  M(kNumberTypeUInt64, uint64_t)                   \
+  M(kNumberTypeFloat16, float16)                   \
+  M(kNumberTypeFloat32, float)                     \
+  M(kNumberTypeFloat64, double)                    \
+  M(kNumberTypeFloat8E4M3FN, float8_e4m3fn)        \
+  M(kNumberTypeFloat8E5M2, float8_e5m2)            \
+  M(kNumberTypeHiFloat8, hifloat8)                 \
+  M(kNumberTypeComplex64, ComplexStorage<float>)   \
+  M(kNumberTypeComplex128, ComplexStorage<double>)
+
+#ifndef KERNEL_EXECUTOR_ANDROID
+#define FOR_EACH_TYPE_EXTRA(M) M(kNumberTypeBFloat16, bfloat16)
+#else
+#define FOR_EACH_TYPE_EXTRA(M)
+#endif
+
+#define FOR_EACH_TYPE(M) \
+  FOR_EACH_TYPE_BASE(M)  \
+  FOR_EACH_TYPE_EXTRA(M)
+
+#define REGISTER_SIZE(address_type_id, address_type) { address_type_id, sizeof(address_type) },
+
+static const std::unordered_map<TypeId, size_t> kTypeSizeMap = {
+  FOR_EACH_TYPE(REGISTER_SIZE)
+};
+
+size_t GetTypeSize(TypeId tid) {
+  return kTypeSizeMap.at(tid);
+}
+
+#undef FOR_EACH_TYPE
+#undef FOR_EACH_TYPE_BASE
+#undef FOR_EACH_TYPE_EXTRA
+#undef REGISTER_SIZE
+// clang-format on
+}  // namespace
+
+bool CPUResManager::Copy(void *dst, const void *src, uint64_t size, CopyType kind, size_t stream_id) const {
+  if (size == 0) {
+    return true;
+  }
+  MS_EXCEPTION_IF_NULL(dst);
+  MS_EXCEPTION_IF_NULL(src);
+  auto ret_code = memcpy_s(dst, size, src, size);
+  if (ret_code == ERANGE) {
+    ConvertSameType(dst, src, size, kNumberTypeUInt8);
+  } else if (ret_code != EOK) {
+    MS_LOG(ERROR) << "Failed to copy tensor from ptr: " << src << " to: " << dst << " size: " <<
size;
+    return false;
+  }
+  return true;
+}
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.h b/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..9330756d3a7f149e0ac476c260cc5b5aa93a6523
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/cpu_res_manager.h
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_RES_MANAGER_H_
+#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_RES_MANAGER_H_
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+class CPUResManager : public DeviceResManager {
+ public:
+  CPUResManager() { Initialize(); }
+  ~CPUResManager() override = default;
+
+  void Initialize() override;
+
+  void Destroy() override;
+
+  std::vector<void *> AllocateContinuousMemory(const std::vector<size_t> &size_list,
+                                               uint32_t stream_id = kDefaultStreamIndex) const override;
+
+  bool Copy(void *dst, const void *src, uint64_t size, CopyType kind, size_t stream_id) const override;
+
+  // Relevant functions to allocate and free device memory of raw ptr.
+  void *AllocateMemory(size_t size, uint32_t stream_id = kDefaultStreamIndex) const override;
+  void FreeMemory(void *ptr) const override;
+  void FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
+                       const std::vector<size_t> &keep_addr_sizes) const override;
+
+ private:
+  std::shared_ptr<CPUMemoryManager> mem_manager_{nullptr};
+};
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_RES_MANAGER_H_
diff --git a/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..727d24f694a9a836b7b0e54593845dc021c8e2ba
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc
@@ -0,0 +1,110 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
diff --git a/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..727d24f694a9a836b7b0e54593845dc021c8e2ba
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.cc
@@ -0,0 +1,110 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+uint8_t *CPUMemoryManager::MemMalloc(size_t size) {
+  auto block = std::make_shared<std::vector<uint8_t>>();
+  try {
+    block->resize(size, 0);
+    auto ptr = block->data();
+    mem_block_map_[ptr] = block;
+    return ptr;
+  } catch (const std::exception &e) {
+    MS_LOG(EXCEPTION) << "Malloc memory failed: size " << size;
+  }
+}
+
+uint8_t *CPUMemoryManager::MallocStaticMem(size_t size, bool, uint32_t) {
+  auto ptr = MemMalloc(size);
+  static_mem_[ptr] = size;
+  return ptr;
+}
+
+uint8_t *CPUMemoryManager::MallocDynamicMem(size_t size, bool) {
+  void *ptr = nullptr;
+  size_t min_size = 0;
+  // First find the smallest cached block which fits the size.
+  for (auto &&iter : cached_mem_) {
+    if (iter.second >= size) {
+      if (min_size == 0 || iter.second < min_size) {
+        ptr = iter.first;
+        min_size = iter.second;
+      }
+    }
+  }
+  if (ptr != nullptr) {
+    if (memset_s(ptr, size, 0, size) != EOK) {
+      // The block is owned by mem_block_map_, so it must not be free()d here.
+      MS_LOG(EXCEPTION) << "Failed to init memory.";
+    }
+    dynamic_mem_[ptr] = min_size;
+    (void)cached_mem_.erase(ptr);
+    return reinterpret_cast<uint8_t *>(ptr);
+  }
+  // If no cached block fits, malloc a new one.
+  auto new_ptr = MemMalloc(size);
+  dynamic_mem_[new_ptr] = size;
+  return new_ptr;
+}
+
+void CPUMemoryManager::ResetDynamicMemory() {
+  // Don't free the blocks; keep them cached for multi-graph reuse.
+  for (auto &&iter : dynamic_mem_) {
+    cached_mem_[iter.first] = iter.second;
+  }
+  dynamic_mem_.clear();
+}
+
+CPUMemoryManager::~CPUMemoryManager() { MemFree(); }
+
+void CPUMemoryManager::MemFree() noexcept {
+  if (mem_ptr_ != nullptr) {
+    mem_ptr_ = nullptr;
+    mem_size_ = 0;
+  }
+  static_mem_.clear();
+  dynamic_mem_.clear();
+  cached_mem_.clear();
+  mem_block_map_.clear();
+}
+
+void *CPUMemoryManager::StaticMemMalloc(size_t mem_size) {
+  auto ptr = MemMalloc(mem_size);
+  if (ptr != nullptr) {
+    static_mem_[ptr] = mem_size;
+    return ptr;
+  } else {
+    MS_LOG(EXCEPTION) << "Malloc memory failed: size " << mem_size;
+  }
+}
+
+void CPUMemoryManager::MemFree(void *ptr) {
+  auto iter = static_mem_.find(ptr);
+  if (iter != static_mem_.end()) {
+    (void)static_mem_.erase(iter);
+    auto block_iter = mem_block_map_.find(ptr);
+    if (block_iter != mem_block_map_.end()) {
+      (void)mem_block_map_.erase(block_iter);
+    }
+  }
+}
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
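MallocDynamicMem above performs a best-fit scan: it walks cached_mem_ linearly and keeps the smallest cached block that still fits. If the cache grows large, the same policy is available in O(log n) by indexing blocks by size; a standalone sketch (not the project's container, which is keyed by address):

#include <cstddef>
#include <map>

// Cached free blocks indexed by size. lower_bound() lands on the smallest
// block whose size is >= the request: the same best-fit choice as the linear
// scan in MallocDynamicMem, but in logarithmic time.
std::multimap<size_t, void *> size_index;

void *TakeBestFit(size_t size) {
  auto it = size_index.lower_bound(size);
  if (it == size_index.end()) {
    return nullptr;  // Nothing large enough is cached; caller allocates fresh.
  }
  void *ptr = it->second;
  size_index.erase(it);
  return ptr;
}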
diff --git a/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f55d4f1ca7d4364bb9ad7064ed833a34ec30007
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_manager.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_MANAGER_H_
+#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_MANAGER_H_
+#include <map>
+#include <memory>
+#include <vector>
+#include "hardware/hardware_abstract/visible.h"
+#include "hardware/hardware_abstract/memory_manager.h"
+#include "hardware/cpu/res_manager/mem_manager/cpu_memory_pool.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+class HARDWARE_EXPORT CPUMemoryManager : public MemoryManager {
+ public:
+  CPUMemoryManager() = default;
+  virtual ~CPUMemoryManager();
+
+  void Initialize() override {}
+  void Finalize() override { CPUMemoryPool::GetInstance().ReleaseDeviceRes(); }
+  void ResetDynamicMemory() override;
+
+  void *StaticMemMalloc(size_t mem_size);
+  void MemFree(void *ptr);
+
+  void *MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle = false,
+                             uint32_t stream_id = kDefaultStreamIndex) override {
+    return CPUMemoryPool::GetInstance().AllocTensorMem(size, from_persistent_mem, false, stream_id);
+  }
+  void FreeMemFromMemPool(void *device_ptr) override { CPUMemoryPool::GetInstance().FreeTensorMem(device_ptr); }
+  std::vector<void *> MallocContinuousMemFromMemPool(const std::vector<size_t> &size_list,
+                                                     uint32_t stream_id = kDefaultStreamIndex) override {
+    return CPUMemoryPool::GetInstance().AllocContinuousTensorMem(size_list, stream_id);
+  }
+
+  DynamicMemPool *GetMemoryPool() override {
+    if (MS_UNLIKELY(memory_pool_ == nullptr)) {
+      memory_pool_ = &(CPUMemoryPool::GetInstance());
+    }
+    return memory_pool_;
+  }
+
+  bool GetDynamicMalloc() { return dynamic_malloc_; }
+
+ protected:
+  uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) override;
+  uint8_t *MallocDynamicMem(size_t size, bool communication_mem) override;
+
+ private:
+  uint8_t *MemMalloc(size_t size);
+  void MemFree() noexcept;
+
+  size_t mem_size_{0};
+  uint8_t *mem_ptr_{nullptr};
+  bool dynamic_malloc_{false};
+  std::map<void *, size_t> dynamic_mem_;
+  std::map<void *, size_t> static_mem_;
+  std::map<void *, size_t> cached_mem_;
+  std::map<void *, std::shared_ptr<std::vector<uint8_t>>> mem_block_map_;
+};
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_MANAGER_H_
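MallocContinuousMemFromMemPool takes a list of sizes and returns one address per entry, carved out of a single contiguous region, which matters for communication ops that fuse inputs and outputs. A usage sketch; it assumes the pool signals failure with a shorter (or empty) result vector, which this header does not spell out:

#include <cstddef>
#include <vector>

// Sketch: request three adjacent buffers (e.g. for a fused op).
bool AllocFusedBuffers(mindspore::device::cpu::CPUMemoryManager *pool, std::vector<void *> *out) {
  const std::vector<size_t> sizes = {256, 512, 1024};
  *out = pool->MallocContinuousMemFromMemPool(sizes);
  return out->size() == sizes.size();  // One address per requested size on success (assumed contract).
}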
diff --git a/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.cc b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78482d105bb4127257204f7202be955dba40f809
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.cc
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2021-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/cpu/res_manager/mem_manager/cpu_memory_pool.h"
+
+#include <mutex>
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+namespace {
+const char kMemAvailable[] = "MemAvailable";
+}
+
+CPUMemoryPool &CPUMemoryPool::GetInstance() {
+  static CPUMemoryPool instance;
+  static std::once_flag flag;
+  std::call_once(flag, [&]() {
+    float init_size = runtime::RuntimeConf::GetInstance()->mem_init_size();
+    size_t init_size_byte = FloatToSize(init_size * kGBToByte);
+    float increase_size = runtime::RuntimeConf::GetInstance()->mem_block_increase_size();
+    size_t increase_size_byte = FloatToSize(increase_size * kGBToByte);
+    float max_size = runtime::RuntimeConf::GetInstance()->mem_max_size();
+    size_t max_size_byte = FloatToSize(max_size * kGBToByte);
+    instance.Initialize(init_size_byte, increase_size_byte, max_size_byte);
+#ifdef ENABLE_DEBUGGER
+    // Set memory profiler callback func.
+    instance.SetMemoryProfilerCallback([&]() {
+      static auto profiler_inst = profiler::Profiler::GetInstance(kCPUDevice);
+      MS_EXCEPTION_IF_NULL(profiler_inst);
+      if (profiler_inst->GetEnableFlag() && profiler_inst->GetProfileMemoryFlag()) {
+        profiler_inst->RecordMemoryPoolInfo(instance.TotalUsedMemStatistics(), instance.TotalMemStatistics(),
+                                            instance.TotalUsedByEventMemStatistics());
+      }
+    });
+#endif
+
+    instance.SetRankIdGetter([]() {
+      size_t rank_id = SIZE_MAX;
+      if (DistributedMeta::GetInstance()->initialized()) {
+        rank_id = DistributedMeta::GetInstance()->global_rank_id();
+      }
+      return rank_id;
+    });
+  });
+  return instance;
+}
+
+size_t CPUMemoryPool::AllocDeviceMem(size_t alloc_size, DeviceMemPtr *addr) {
+  if (alloc_size == 0) {
+    MS_LOG(EXCEPTION) << "The memory alloc size is 0.";
+  }
+
+  *addr = malloc(alloc_size);
+  if (*addr == nullptr) {
+    MS_LOG(ERROR) << "malloc memory failed.";
+    return 0;
+  }
+
+  total_used_memory_ += alloc_size;
+  MS_LOG(INFO) << "Current alloc size[" << alloc_size << "], total used size[" << total_used_memory_ << "].";
+
+  return alloc_size;
+}
+
+bool CPUMemoryPool::FreeDeviceMem(const DeviceMemPtr &addr) {
+  free(addr);
+  return true;
+}
+
+size_t CPUMemoryPool::free_mem_size() { return mindspore::GetSystemMemorySize(kMemAvailable); }
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
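GetInstance above combines a function-local static (thread-safe construction since C++11) with std::call_once, so both construction and the one-time pool configuration are race-free even when the first callers arrive concurrently. The idiom in isolation (Configure stands in for the setup shown above):

#include <mutex>

class Pool {
 public:
  static Pool &GetInstance() {
    static Pool instance;          // Construction is thread-safe (C++11 magic static).
    static std::once_flag flag;
    std::call_once(flag, [] { instance.Configure(); });  // Runs exactly once.
    return instance;
  }

 private:
  Pool() = default;
  void Configure() { /* read limits, pre-reserve memory, install callbacks... */ }
};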
diff --git a/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.h b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb33d652188096a30a2ebcf491375ad98e6239a1
--- /dev/null
+++ b/inferrt/src/hardware/cpu/res_manager/mem_manager/cpu_memory_pool.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_POOL_H_
+#define MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_POOL_H_
+
+#include <memory>
+#include <string>
+
+#include "common/common.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+namespace cpu {
+class HARDWARE_EXPORT CPUMemoryPool : public DynamicMemPoolBestFit {
+ public:
+  ~CPUMemoryPool() override = default;
+
+  static CPUMemoryPool &GetInstance();
+
+  size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override;
+  bool FreeDeviceMem(const DeviceMemPtr &addr) override;
+  size_t free_mem_size() override;
+  std::string GetMemoryPoolType() const override { return "CPU"; }
+
+ private:
+  CPUMemoryPool() = default;
+  DISABLE_COPY_AND_ASSIGN(CPUMemoryPool);
+
+  size_t total_used_memory_{0};
+};
+}  // namespace cpu
+}  // namespace device
+}  // namespace mindspore
+
+#endif  // MINDSPORE_CCSRC_PLUGIN_RES_MANAGER_CPU_CPU_MEM_MANAGER_CPU_MEMORY_POOL_H_
diff --git a/inferrt/src/hardware/hardware_abstract/CMakeLists.txt b/inferrt/src/hardware/hardware_abstract/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..41cc82cbcafc141b12261add719e19d3a29ad993
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/CMakeLists.txt
@@ -0,0 +1,41 @@
+check_debug_log_out()
+
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+message(STATUS "Python3_FOUND: ${Python3_FOUND}")
+message(STATUS "Python3_INCLUDE_DIRS: ${Python3_INCLUDE_DIRS}")
+include_directories(${Python3_INCLUDE_DIRS})
+
+if(TARGET pybind11::headers)
+    message("-- pybind11 already included")
+else()
+    # Prepare pybind11 module
+    message("-- pybind11 not included, start including")
+    set(depname "pybind11")
+    set(PYBIND11_PATH "${PROJECT_SOURCE_DIR}/${depname}-src")
+    message("-- PYBIND11_PATH: ${PYBIND11_PATH}")
+
+    # Download and copy the pybind11 project if it does not exist
+    if(NOT EXISTS ${PYBIND11_PATH})
+        message("-- Downloading ${depname} module...")
+        include(FetchContent)
+        FetchContent_Declare(
+            ${depname}
+            # Use the gitee mirror of https://github.com/pybind/pybind11
+            GIT_REPOSITORY https://gitee.com/mirrors/pybind11.git
+            GIT_TAG 58c382a8e3d7081364d2f5c62e7f429f0412743b # stable
+        )
+        FetchContent_MakeAvailable(${depname})
+        message("-- pybind11_SOURCE_DIR: ${${depname}_SOURCE_DIR}")
+        message("-- pybind11_BINARY_DIR: ${${depname}_BINARY_DIR}")
+        # Find pybind11 package location, or call find_package(pybind11 REQUIRED)
+        message("-- Copying ${${depname}_SOURCE_DIR} to ${PROJECT_SOURCE_DIR}/...")
+        file(COPY ${${depname}_SOURCE_DIR} DESTINATION ${PROJECT_SOURCE_DIR})
+    endif()
+
+    # Include pybind11 directories
+    include_directories(${PYBIND11_PATH}/include)
+
+endif()
+
+file(GLOB_RECURSE HARDWARE_ABSTRACT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
+add_library(hardware_abstract_obj STATIC ${HARDWARE_ABSTRACT_SRC_FILES})
\ No newline at end of file
diff --git a/inferrt/src/hardware/hardware_abstract/common.cc b/inferrt/src/hardware/hardware_abstract/common.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b849606fb1edc50cef850ab8ac8b43189c4b6b
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/common.cc
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/hardware_abstract/common.h"
+
+namespace mindspore {
+GilReleaseWithCheck::GilReleaseWithCheck() {
+  // Only release the GIL when the interpreter is alive and this thread holds it.
+  if (Py_IsInitialized() != 0 && PyGILState_Check() != 0) {
+    release_ = std::make_unique<py::gil_scoped_release>();
+  }
+}
+
+GilReleaseWithCheck::~GilReleaseWithCheck() { release_ = nullptr; }
+}  // namespace mindspore
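GilReleaseWithCheck releases the GIL only when the interpreter is initialized and the current thread actually holds it, which makes it safe to instantiate on non-Python threads. A usage sketch (BlockingDeviceSync is a hypothetical long-running device call):

// Sketch: drop the GIL (if held) around a blocking device call so Python
// threads keep running; the GIL is reacquired when the guard is destroyed.
void SyncDeviceFromPython() {
  mindspore::GilReleaseWithCheck gil_release;
  BlockingDeviceSync();  // Hypothetical long-running device call.
}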
diff --git a/inferrt/src/hardware/hardware_abstract/common.h b/inferrt/src/hardware/hardware_abstract/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fa280efd0c3f96736f2e4ceeeddf3504703607f
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/common.h
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef HARDWARE_COMMON_H__
+#define HARDWARE_COMMON_H__
+
+#include <memory>
+
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+namespace mindspore {
+// RAII guard that releases the Python GIL if (and only if) it is currently held.
+class GilReleaseWithCheck {
+ public:
+  GilReleaseWithCheck();
+
+  ~GilReleaseWithCheck();
+
+ private:
+  std::unique_ptr<py::gil_scoped_release> release_;
+};
+}  // namespace mindspore
+
+#endif  // HARDWARE_COMMON_H__
diff --git a/inferrt/src/hardware/hardware_abstract/device_context.cc b/inferrt/src/hardware/hardware_abstract/device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..041913e78e632873838321afd7705d1fa831c856
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/device_context.cc
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "hardware/hardware_abstract/device_context.h" + +namespace mindspore { +namespace device { +DeviceResManager::DeviceResManager() { + device_context_ = nullptr; +} + +bool DeviceContext::initialized() const { + return initialized_; +} +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/hardware_abstract/device_context.h b/inferrt/src/hardware/hardware_abstract/device_context.h new file mode 100644 index 0000000000000000000000000000000000000000..de1e9f685e24a4a13f93ed6f02f32262711567f0 --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/device_context.h @@ -0,0 +1,326 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INFERRT_SRC_HARDWARE_DEVICE_CONTEXT_H_ +#define INFERRT_SRC_HARDWARE_DEVICE_CONTEXT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/common.h" +#include "hardware/hardware_abstract/visible.h" +#include "hardware/hardware_abstract/stream_util.h" +#ifdef __APPLE__ +#include "async/spinlock.h" +#endif + +namespace mindspore { +class DeviceEvent; +using DeviceEventPtr = std::shared_ptr; +namespace runtime { +enum class KernelTaskType; +} +namespace device { +constexpr size_t kSizeZero = 0; + +struct DeviceContextKey { + // device type name, such as 'GPU' 'Ascend' 'CPU'. + std::string device_name_; + uint32_t device_id_{0}; + + // Use the result of ToString() as key to look up DeviceContext + // in cache map which maintains created DeviceContext objects. + std::string ToString() const { return device_name_ + "_" + std::to_string(device_id_); } +}; + +class DeviceResManager; +class KernelExecutor; + +// DeviceContext is unified interface of interaction with device. +class HARDWARE_EXPORT DeviceContext { + public: + explicit DeviceContext(const DeviceContextKey &device_context_key) + : device_context_key_(device_context_key), initialized_(false) {} + virtual ~DeviceContext() = default; + + // Initialize the device context. + virtual void Initialize() = 0; + + // Destroy device context and release device resource. + virtual void Destroy() = 0; + + // Get device_context_key_ to obtain device name and device id. + const DeviceContextKey &device_context_key() const { return device_context_key_; } + + // Get kernel executor. + std::shared_ptr GetKernelExecutor() const { return kernel_executor_; } + + void SetKernelExecutor(const std::shared_ptr &kernel_executor) { kernel_executor_ = kernel_executor; } + + // Return whether this device context is initialized. + bool initialized() const; + + DeviceContextKey device_context_key_; + std::unique_ptr device_res_manager_; + + protected: +#ifdef __APPLE__ + // There are some problems with using mutex on Mac, use spinlocks instead. 
+ inline static SpinLock init_lock_; +#else + inline static std::mutex init_mutex_; +#endif + bool initialized_; + + private: + std::shared_ptr kernel_executor_; +}; +using DeviceContextPtr = std::shared_ptr; +class MemoryManager; +class CollectiveCommunicationLib; +class OffloadedMemPool; +using DeviceMemPtr = void *; + +class HARDWARE_EXPORT DeviceResManager { + public: + DeviceResManager(); + + virtual ~DeviceResManager() = default; + + // Initialize the device resource manager. + virtual void Initialize() {} + + virtual void SetAclDeterministic() {} + + // Destroy device resource manager and release device resource. + virtual void Destroy() {} + + // Bind device to current thread to gain device control privileges + // If force_bind is true, bind context to current thread every time; + // Otherwise, only bind context to current thread for the first time. + virtual bool BindDeviceToCurrentThread(bool force_bind) const { return true; } + virtual void ResetStreamAndCtx() const {} + + // Relevant function to allocate and free device memory of raw ptr. + virtual void *AllocateMemory(size_t size, uint32_t stream_id = kDefaultStreamIndex) const = 0; + virtual void FreeMemory(void *ptr) const = 0; + virtual void FreePartMemorys(const std::vector &free_addrs, const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) const = 0; + virtual void DefragMemory() {} + virtual bool IsEnableVmm() const { return false; } + + // Interface for multi stream event control. + virtual bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + const std::vector> &memory_stream_addresses, + const DeviceEventPtr &input_event) { + return false; + } + + virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) { + return false; + } + + virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id) { return false; } + + virtual bool SyncAllEvents() { return false; } + + virtual size_t GetMaxUsedMemorySize() const { return 0; } + + // Relevant function to manage memory statistics + virtual size_t GetTotalMemStatistics() const { return 0; } + virtual size_t GetTotalUsedMemStatistics() const { return 0; } + virtual size_t GetTotalIdleMemStatistics() const { return 0; } + virtual size_t GetTotalEagerFreeMemStatistics() const { return 0; } + virtual size_t GetUsedMemPeakStatistics() const { return 0; } + virtual size_t GetReservedMemPeakStatistics() const { return 0; } + virtual std::unordered_map GetBlockCountsStatistics() const { return {}; } + virtual std::unordered_map GetBlockUnitSizeStatistics() const { return {}; } + virtual std::unordered_map> + GetCommonMemBlocksInfoStatistics() const { + return {}; + } + virtual std::unordered_map> + GetPersistentMemBlocksInfoStatistics() const { + return {}; + } + virtual void ResetMaxMemoryReserved() {} + virtual void ResetMaxMemoryAllocated() {} + + virtual size_t EmptyCache() { return -1L; } + + // Allocate host memory with raii and ref count + virtual std::shared_ptr AllocateHostMemory(size_t size) const { + return std::shared_ptr(::malloc(size), ::free); + } + virtual size_t GetAvailableMemSize() const { return 0; } + + // Allocate continuous device memory according to size list. + // Communication operators may need continuous memory for input and output + // to optimize the communication performance. 
+ virtual std::vector AllocateContinuousMemory(const std::vector &size_list, + uint32_t stream_id = kDefaultStreamIndex) const { + LOG_ERROR << "Unimplemented interface."; + return {}; + } + + // Create a stream with assigning a stream id, the assigned stream id will be written to the parameter '*stream_id'. + virtual bool CreateStream(size_t *stream_id) const { + LOG_ERROR << "Unimplemented interface: 'CreateStream'."; + *stream_id = kSizeZero; + return false; + } + + // Create a stream with priority. + virtual bool CreateStreamWithPriority(size_t *stream_id, int32_t priority) const { + *stream_id = kSizeZero; + return false; + } + + virtual size_t QueryStreamSize() const { return 0L; } + virtual std::vector GetStreamIds() const { return {}; } + + // If multi-stream used in pynative mode, other streams must be sync before the graph + // is executed. Otherwise, out-of-order occurs. Therefore this flag is added. + // This solution is a temporary solution, this flag will be removed after multi-stream is + // supported in graph mode. + virtual bool single_op_multi_stream_enable() const { return false; } + virtual void set_single_op_multi_stream_enable(bool single_op_multi_stream_enable) {} + + // Get the stream pointer by stream_id. + virtual void *GetStream(size_t stream_id) const { return nullptr; } + + // Set currently using stream id. + virtual void SetCurrentStreamId(size_t stream_id) { return; } + + // Get currently using stream id. + virtual size_t GetCurrentStreamId() const { return kSizeZero; } + + virtual void *GetStream() const { return nullptr; } + + virtual size_t GetCommunicationStreamID() const { return kDefaultStreamIndex; } + + virtual size_t GetCommunicationStreamIDByGroup(const std::string &group) const { return GetCommunicationStreamID(); } + + // Destroy a stream bound to the input parameter "stream_id". + virtual bool DestroyStream(size_t stream_id) const { return false; } + + // Query tasks' completion status of a stream. + virtual bool QueryStream(size_t stream_id) const { return true; } + + // Synchronize stream, device such as GPU and Ascend need stream to launch kernel asynchronously, + // Using 'SyncStream' to block thread and wait for completing all tasks on specific stream. + // Using 'SyncAllStream' to block thread and wait for completing all tasks on all streams. + // Devices without stream could ignore the implementation of these function. + // Since the current entry for creating streams is not unified, the implementation of the 'SyncStream' and + // "SyncAllStreams" interfaces are implemented by subclasses. + virtual bool SyncStream(size_t stream_id) const { return true; } + + // 'sync_device' is used for Ascend backend. + virtual bool SyncAllStreams(bool sync_device = true) const { return true; } + + virtual bool SyncNotDefaultStreams() const { return true; } + + // Return default stream id. Normally it's 0. + virtual size_t DefaultStream() const { return 0; } + + // Create device event for runtime. + virtual DeviceEventPtr CreateRuntimeEvent(bool enable_blocking, bool enable_record_wait) { return nullptr; } + + // Create device event with flag. + virtual DeviceEventPtr CreateEventWithFlag(bool enable_timing, bool blocking, bool use_extensional_api = true) { + return nullptr; + } + + // Destroy specified device event. + virtual bool DestroyEvent(const DeviceEventPtr &event) { return true; } + + // Destroy all device events. 
+ virtual bool DestroyAllEvents() { return true; } + + virtual std::shared_ptr mem_manager() const { return nullptr; } + + virtual bool LaunchCallback(std::function callback_func, size_t stream_id, bool is_block = false) const { + callback_func(); + return true; + } + + protected: + DeviceContext *device_context_{nullptr}; + + private: + template + friend class DeviceInterface; + void SetDeviceContext(DeviceContext *device_context) { device_context_ = device_context; } + std::shared_ptr offloaded_mem_pool_; +}; + +using CallbackFunc = std::function; + +class HARDWARE_EXPORT KernelExecutor { + public: + virtual ~KernelExecutor() = default; + + virtual void Initialize() {} + virtual void Destroy() {} + + void SetDeviceContext(DeviceContext *device_context) { device_context_ = device_context; } + + protected: + DeviceContext *device_context_{nullptr}; +}; + +template +class DeviceInterface : public DeviceContext {}; + +template <> +class DeviceInterface<> : public DeviceContext { + public: + explicit DeviceInterface(const DeviceContextKey &key) : DeviceContext(key) {} + + protected: + void CheckUnset(const void *ptr, const std::string &error_msg) const { + if (ptr != nullptr) { + LOG_ERROR << error_msg; + } + } +}; + +template +class DeviceInterface : public DeviceInterface { + public: + explicit DeviceInterface(const DeviceContextKey &key) : DeviceInterface(key) { + if constexpr (std::is_base_of_v) { + DeviceInterface::CheckUnset(reinterpret_cast(DeviceContext::device_res_manager_.get()), + "DeviceResManager has been registered!"); + DeviceContext::device_res_manager_ = std::make_unique(); + DeviceContext::device_res_manager_->SetDeviceContext(this); + } else if constexpr (std::is_base_of_v) { + DeviceInterface::CheckUnset(reinterpret_cast(DeviceContext::GetKernelExecutor().get()), + "KernelExecutor has been registered!"); + DeviceContext::SetKernelExecutor(std::make_shared()); + DeviceContext::GetKernelExecutor()->SetDeviceContext(this); + } + } +}; +} // namespace device +} // namespace mindspore +#endif // INFERRT_SRC_HARDWARE_DEVICE_CONTEXT_H_ diff --git a/inferrt/src/hardware/hardware_abstract/device_context_manager.cc b/inferrt/src/hardware/hardware_abstract/device_context_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..f53cb3d283fcb323cc8e07669c61020bea793f6d --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/device_context_manager.cc @@ -0,0 +1,191 @@ +/** + * Copyright 2021-2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hardware/hardware_abstract/device_context_manager.h" +#if defined(_WIN32) || defined(_WIN64) +#include +#endif +#ifdef __linux__ +#include +#endif // #ifdef __linux__ +#include +#include +#include +#include +#include +#include "hardware/hardware_abstract/dlopen_macro.h" +#include "hardware/hardware_abstract/multi_stream_controller.h" +#include "common/logger.h" + +namespace mindspore { +namespace { +size_t constexpr GetStrLen(const char *const str) { + if (*str == '\0') { + return 0; + } else { + return GetStrLen(str + 1) + 1; + } +} + +constexpr auto kCudaHomeEnv = "CUDA_HOME"; +constexpr auto kNvccVersionKeyWords = "Cuda compilation tools, release "; +constexpr size_t kNvccVersionKeyWordsSize = GetStrLen(kNvccVersionKeyWords); +constexpr auto kSuccessKeyWord = "Success"; +constexpr size_t kSuccessKeyWordSize = GetStrLen(kSuccessKeyWord); +constexpr size_t kBufferSize = 999; +constexpr auto kGpuPluginName = "libmindspore_gpu"; +#if defined(_WIN32) +constexpr bool kIsWindowsPlatform = true; +#else +constexpr bool kIsWindowsPlatform = false; +#endif +} // namespace +namespace device { + +DeviceContextManager &DeviceContextManager::GetInstance() { + static DeviceContextManager instance{}; +#ifdef WITH_BACKEND + instance.LoadPlugin(); +#endif + return instance; +} + +void DeviceContextManager::Register(const std::string &device_name, DeviceContextCreator &&device_context_creator) { + LOG_OUT << "Register device context creator for device: " << device_name; + if (device_context_creators_.find(device_name) == device_context_creators_.end()) { + (void)device_context_creators_.emplace(device_name, device_context_creator); + } +} + +void DeviceContextManager::ClearDeviceContexts() { + multi_stream_controllers_.clear(); + for (auto &iter : device_contexts_) { + LOG_OUT << "Release device " << iter.first; + if (iter.second == nullptr) { + LOG_ERROR << "device context is null"; + } + iter.second->Destroy(); + } + backend_to_device_context_.clear(); + device_contexts_.clear(); +} + +void DeviceContextManager::ChildAfterFork() { + LOG_OUT << "DeviceContextManager reinitialize after fork."; + LOG_OUT << "Clear device_contexts_."; + device_contexts_.clear(); + LOG_OUT << "DeviceContextManager reinitialize after fork done."; +} + +void DeviceContextManager::BindDeviceCtx() const { + for (auto &iter : device_contexts_) { + if (iter.second == nullptr) { + LOG_ERROR << "device context is null"; + } + if (iter.second->device_res_manager_ == nullptr) { + LOG_ERROR << "device res manager is null"; + } + if (!iter.second->device_res_manager_->BindDeviceToCurrentThread(true)) { + LOG_ERROR << "Bind device failed"; + } + } +} + +DeviceContext *DeviceContextManager::GetOrCreateDeviceContext(const DeviceContextKey &device_context_key) { + std::string device_context_key_str = device_context_key.ToString(); + std::string name = device_context_key.device_name_; + + auto device_context_iter = device_contexts_.find(device_context_key_str); + if (device_context_iter != device_contexts_.end()) { + return device_context_iter->second.get(); + } + + std::shared_ptr device_context; + auto creator_iter = device_context_creators_.find(name); + if (creator_iter != device_context_creators_.end()) { + device_context = (creator_iter->second)(device_context_key); + if (device_context == nullptr) { + LOG_ERROR << "create device context failed"; + } + if (device_context->device_res_manager_ == nullptr) { + LOG_ERROR << "create device res manager failed"; + } + device_contexts_[device_context_key_str] = device_context; 
+    backend_to_device_context_[name] = device_context;
+    multi_stream_controllers_[name] =
+      std::make_shared<MultiStreamController>(device_context->device_res_manager_.get());
+  } else {
+    LOG_ERROR << "Create device context failed, please make sure target device:" << name
+              << " is available, error message of loading plugins: " << GetErrorMsg();
+  }
+  return device_context.get();
+}
+
+DeviceContextPtr DeviceContextManager::GetDeviceContext(const std::string &device_target) {
+  if (backend_to_device_context_.count(device_target) == 0) {
+    LOG_OUT << "Device context of device " << device_target << " is not created yet.";
+    return nullptr;
+  }
+  return backend_to_device_context_[device_target];
+}
+
+MultiStreamControllerPtr &DeviceContextManager::GetMultiStreamController(const std::string &device_name) {
+  auto &&iter = multi_stream_controllers_.find(device_name);
+  if (iter != multi_stream_controllers_.end()) {
+    return iter->second;
+  }
+  LOG_ERROR << "Multi stream controller for device_name : " << device_name << " not found, try to initialize it.";
+  // Use device id 0 temporarily.
+  uint32_t device_id = 0;
+  DeviceContextKey host_key = {device_name, device_id};
+  const auto &real_device_context = GetOrCreateDeviceContext(host_key);
+  if (real_device_context == nullptr) {
+    LOG_ERROR << "Get or create device context failed.";
+  }
+  auto &&iter_again = multi_stream_controllers_.find(device_name);
+  if (iter_again == multi_stream_controllers_.end()) {
+    LOG_ERROR << "Get multi stream controller failed, device_name : " << device_name << ".";
+  }
+  return iter_again->second;
+}
+
+void DeviceContextManager::WaitTaskFinishOnDevice() const {
+  for (const auto &item : device_contexts_) {
+    auto device_context = item.second;
+    try {
+      if (device_context != nullptr && !device_context->device_res_manager_->SyncAllStreams()) {
+        LOG_ERROR << "SyncStream failed";
+        return;
+      }
+    } catch (const std::exception &ex) {
+      LOG_ERROR << "SyncStream failed, exception:" << ex.what();
+      return;
+    }
+  }
+}
+
+void DeviceContextManager::SyncAllStreams() const {
+  for (const auto &item : device_contexts_) {
+    auto device_context = item.second;
+    if (device_context != nullptr && !device_context->device_res_manager_->SyncAllStreams()) {
+      LOG_ERROR << "SyncStream failed, device info: " << device_context->device_context_key().ToString();
+    }
+  }
+}
+
+std::string DeviceContextManager::GetErrorMsg() const { return dlopen_error_msg_.str(); }
+}  // namespace device
+}  // namespace mindspore
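GetOrCreateDeviceContext is a cached factory over a name-to-creator registry, the same registry that MS_REGISTER_DEVICE in the header below feeds at static-initialization time. Stripped of plugin loading and stream controllers, the core idiom reduces to roughly this sketch (all names here are illustrative):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Ctx { virtual ~Ctx() = default; };
using Creator = std::function<std::shared_ptr<Ctx>(const std::string &)>;

std::map<std::string, Creator> creators;             // Filled by static registrar objects.
std::map<std::string, std::shared_ptr<Ctx>> cache;   // At most one context per key.

std::shared_ptr<Ctx> GetOrCreate(const std::string &key) {
  if (auto it = cache.find(key); it != cache.end()) {
    return it->second;  // Reuse the previously created context.
  }
  auto creator = creators.find(key);
  if (creator == creators.end()) {
    return nullptr;  // No backend registered under this name.
  }
  return cache[key] = creator->second(key);
}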
diff --git a/inferrt/src/hardware/hardware_abstract/device_context_manager.h b/inferrt/src/hardware/hardware_abstract/device_context_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..20757d42dc7330fefc403caa65e6cf94cfff91b8
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/device_context_manager.h
@@ -0,0 +1,95 @@
+/**
+ * Copyright 2021-2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_HARDWARE_DEVICE_CONTEXT_MANAGER_H_
+#define MINDSPORE_CCSRC_RUNTIME_HARDWARE_DEVICE_CONTEXT_MANAGER_H_
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+class MultiStreamController;
+using DeviceContextCreator = std::function<std::shared_ptr<DeviceContext>(const DeviceContextKey &)>;
+using MultiStreamControllerPtr = std::shared_ptr<MultiStreamController>;
+
+class HARDWARE_EXPORT DeviceContextManager {
+ public:
+  ~DeviceContextManager() = default;
+  static DeviceContextManager &GetInstance();
+  void Register(const std::string &device_name, DeviceContextCreator &&device_context_creator);
+  DeviceContext *GetOrCreateDeviceContext(const DeviceContextKey &device_context_key);
+  // Return the device context of the specified device target.
+  // The difference from 'GetOrCreateDeviceContext' is that this method only queries a device context by
+  // device target (without device id), since MindSpore only supports 'single process, single device'.
+  DeviceContextPtr GetDeviceContext(const std::string &device_target);
+  MultiStreamControllerPtr &GetMultiStreamController(const std::string &device_name);
+  void ClearDeviceContexts();
+  void ChildAfterFork();
+  void WaitTaskFinishOnDevice() const;
+  void SyncAllStreams() const;
+  void UnloadPlugin();
+  std::string GetErrorMsg() const;
+  void BindDeviceCtx() const;
+
+ private:
+  DeviceContextManager() = default;
+  void LoadPlugin();
+  bool SelectGpuPlugin(const std::string &cuda_home, const std::set<std::string> &file_names);
+
+  // Plugin file name -> dlopen handle.
+  std::map<std::string, void *> plugin_maps_;
+  bool load_init_;
+  std::string plugin_path_;
+
+  // The string converted from DeviceContextKey -> DeviceContextPtr.
+  std::map<std::string, DeviceContextPtr> device_contexts_;
+  // The name of device -> DeviceContextPtr.
+  std::map<std::string, DeviceContextPtr> backend_to_device_context_;
+  // The name of device -> DeviceContextCreator.
+  std::map<std::string, DeviceContextCreator> device_context_creators_;
+  // Records error messages from dlopen; printed when creating a device context fails.
+  std::stringstream dlopen_error_msg_;
+
+  // The name of device ('GPU' 'Ascend' 'CPU') -> MultiStreamControllerPtr.
+  std::map<std::string, MultiStreamControllerPtr> multi_stream_controllers_;
+};
+
+class HARDWARE_EXPORT DeviceContextRegister {
+ public:
+  DeviceContextRegister(const std::string &device_name, DeviceContextCreator &&runtime_creator) {
+    DeviceContextManager::GetInstance().Register(device_name, std::move(runtime_creator));
+  }
+  ~DeviceContextRegister() = default;
+};
+
+#define MS_REGISTER_DEVICE(DEVICE_NAME, DEVICE_CONTEXT_CLASS)            \
+  static const DeviceContextRegister g_device_##DEVICE_NAME##_reg(       \
+    DEVICE_NAME, [](const DeviceContextKey &device_context_key) {        \
+      return std::make_shared<DEVICE_CONTEXT_CLASS>(device_context_key); \
+    })
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_RUNTIME_HARDWARE_DEVICE_CONTEXT_MANAGER_H_
diff --git a/inferrt/src/hardware/hardware_abstract/device_event.h b/inferrt/src/hardware/hardware_abstract/device_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..224c31bfda6c4e90fb8787d837bb3f64235be296
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/device_event.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CORE_IR_DEVICE_EVENT_H +#define MINDSPORE_CORE_IR_DEVICE_EVENT_H + +#include +#include + +namespace mindspore { +class DeviceEvent { + public: + virtual ~DeviceEvent() = default; + virtual bool IsReady() const = 0; + virtual void WaitEvent() = 0; + virtual bool WaitEvent(uint32_t stream_id) = 0; + virtual void WaitEventWithoutReset() = 0; + virtual void WaitEventWithoutReset(uint32_t stream_id) {} + virtual void ResetEvent() {} + virtual void ResetEvent(uint32_t stream_id) {} + virtual void RecordEvent() = 0; + virtual void RecordEvent(uint32_t stream_id) = 0; + virtual bool NeedWait() = 0; + virtual void SyncEvent() = 0; + virtual bool QueryEvent() = 0; + virtual void ElapsedTime(float *cost_time, const DeviceEvent *other) = 0; + virtual bool DestroyEvent() = 0; + virtual void set_wait_stream(void *stream) = 0; + virtual void set_record_stream(void *stream) = 0; +}; +using DeviceEventPtr = std::shared_ptr; +using DeviceEventPtrList = std::vector; +} // namespace mindspore +#endif // MINDSPORE_CORE_IR_DEVICE_EVENT_H diff --git a/inferrt/src/hardware/hardware_abstract/dlopen_macro.h b/inferrt/src/hardware/hardware_abstract/dlopen_macro.h new file mode 100644 index 0000000000000000000000000000000000000000..e89786c59809b2fc0824c744762fd94b5679629a --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/dlopen_macro.h @@ -0,0 +1,184 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CCSRC_UTILS_DLOPEN_MACRO_H +#define MINDSPORE_CCSRC_UTILS_DLOPEN_MACRO_H + +#ifndef _WIN32 +#include +#else +#include +#undef ERROR +#undef SM_DEBUG +#undef Yield +#endif +#include +#include + +#ifndef _WIN32 +#define PORTABLE_EXPORT __attribute__((visibility("default"))) +#else +#define PORTABLE_EXPORT __declspec(dllexport) +#endif + +constexpr char kSimuSocName[] = "MS_DRY_RUN"; + +template +struct SimuDataFactory { + static T Data() { + static T data{}; + return data; + } +}; + +template +struct SimuDataFactory { + static T *Data() { + static int data{}; + return reinterpret_cast(&data); + } +}; + +template +struct SimuDataFactory { + static T **Data() { + static int data{}; + static T *data_ptr = reinterpret_cast(&data); + return &data_ptr; + } +}; + +template +struct SimuCreateTypeGetter { + typedef T type; +}; + +template +struct SimuCreateTypeGetter { + typedef T type; +}; + +template +struct SimuCreateTypeGetter { + typedef T *type; +}; + +#define PLUGIN_METHOD(name, return_type, ...) 
\ + extern "C" { \ + PORTABLE_EXPORT return_type Plugin##name(__VA_ARGS__); \ + } \ + constexpr const char *k##name##Name = "Plugin" #name; \ + using name##FunObj = std::function; \ + using name##FunPtr = return_type (*)(__VA_ARGS__); + +#define ORIGIN_METHOD(name, return_type, ...) \ + extern "C" { \ + return_type name(__VA_ARGS__); \ + } \ + constexpr const char *k##name##Name = #name; \ + using name##FunObj = std::function; \ + using name##FunPtr = return_type (*)(__VA_ARGS__); + +#define ORIGIN_METHOD_WITH_SIMU(name, return_type, ...) \ + ORIGIN_METHOD(name, return_type, __VA_ARGS__) \ + template \ + inline T SimuFuncI##name(__VA_ARGS__) { \ + return SimuDataFactory::Data(); \ + } \ + \ + template <> \ + inline void SimuFuncI##name(__VA_ARGS__) {} \ + extern name##FunObj name##_; \ + inline void SimuAssignI##name() { name##_ = SimuFuncI##name; } + +#define ACLRT_GET_SOC_NAME_WITH_SIMU(name, return_type, ...) \ + ORIGIN_METHOD(name, return_type, __VA_ARGS__) \ + template \ + inline T SimuFuncI##name(__VA_ARGS__) { \ + return kSimuSocName; \ + } \ + \ + template <> \ + inline void SimuFuncI##name(__VA_ARGS__) {} \ + extern name##FunObj name##_; \ + inline void SimuAssignI##name() { name##_ = SimuFuncI##name; } + +#define ORIGIN_METHOD_WITH_SIMU_CREATE(name, return_type, create_type_ptr, ...) \ + ORIGIN_METHOD(name, return_type, create_type_ptr, ##__VA_ARGS__) \ + template \ + inline T SimuFuncI##name(U *in_ret, ##__VA_ARGS__) { \ + static U st##name{}; \ + *in_ret = st##name; \ + T ret{}; \ + return ret; \ + } \ + \ + template <> \ + inline aclError SimuFuncI##name(void **in_ret, ##__VA_ARGS__) { \ + static uintptr_t currentPointer = 0; \ + currentPointer += sizeof(void *); \ + *in_ret = reinterpret_cast(currentPointer); \ + return ACL_SUCCESS; \ + } \ + \ + template <> \ + inline void SimuFuncI##name(void **in_ret, ##__VA_ARGS__) { \ + static uintptr_t currentPointer = 0; \ + currentPointer += sizeof(void *); \ + *in_ret = reinterpret_cast(currentPointer); \ + } \ + extern name##FunObj name##_; \ + inline void SimuAssignI##name() { \ + name##_ = SimuFuncI##name::type>; \ + } + +#define ASSIGN_SIMU(name) SimuAssignI##name(); + +inline static std::string GetDlErrorMsg() { +#ifndef _WIN32 + const char *result = dlerror(); + return (result == nullptr) ? 
"Unknown" : result; +#else + return std::to_string(GetLastError()); +#endif +} + +template +static T DlsymWithCast(void *handle, const char *symbol_name) { +#ifndef _WIN32 + T symbol = reinterpret_cast(reinterpret_cast(dlsym(handle, symbol_name))); +#else + T symbol = reinterpret_cast(GetProcAddress(reinterpret_cast(handle), symbol_name)); +#endif + if (symbol == nullptr) { + std::abort(); + } + return symbol; +} + +#define DlsymFuncObj(func_name, plugin_handle) DlsymWithCast(plugin_handle, k##func_name##Name); + +template +static T DlsymAscend(void *handle, const char *symbol_name) { + T symbol = reinterpret_cast(reinterpret_cast(dlsym(handle, symbol_name))); + if (symbol == nullptr) { + std::abort(); + } + return symbol; +} + +#define DlsymAscendFuncObj(func_name, plugin_handle) DlsymAscend(plugin_handle, k##func_name##Name) +#endif // MINDSPORE_CCSRC_UTILS_DLOPEN_MACRO_H diff --git a/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.cc b/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..108678c2c249ffc37e7694fecdaf094e43593d5e --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.cc @@ -0,0 +1,1186 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "hardware/hardware_abstract/memory/mem_pool_util.h" +#include "common/logger.h" +#include "common/common.h" + +namespace mindspore { +namespace device { +MemBuf::MemBuf(size_t size, void *addr, uint32_t stream_id, MemBlock *mem_block, MemBufStatus status) + : prev_(nullptr), + next_(nullptr), + size_(size), + addr_(addr), + stream_id_(stream_id), + mem_block_(mem_block), + status_(status) {} + +MemBuf::~MemBuf() {} + +MemBufAllocator::~MemBufAllocator() { + LOG_OUT << "MemBufAllocator[" << this << "] : " << BriefInfo() << " deconstruct."; + for (auto &mem_block : mem_blocks_) { + delete mem_block; + } + mem_blocks_.clear(); + for (auto mem_buf : free_mem_bufs_) { + delete mem_buf; + } + free_mem_bufs_.clear(); + for (auto mem_buf : eager_free_mem_bufs_) { + delete mem_buf; + } + eager_free_mem_bufs_.clear(); + delete search_key_; +} + +void MemBufAllocator::ReleaseDeviceRes() { + LOG_OUT << "Release device resource for allocator, " << BriefInfo() << ", mem_blocks_ size : " << mem_blocks_.size() + << "."; + for (auto mem_block : mem_blocks_) { + LOG_OUT << "Clean mem block : " << mem_block->ToJson() << "."; + (void)mem_block_cleaner_(mem_block); + } + for (auto mem_block : mem_blocks_) { + LOG_OUT << "Delete mem block : " << mem_block->ToJson() << "."; + delete mem_block; + } + mem_blocks_.clear(); + + LOG_OUT << "Free mem buf size : " << free_mem_bufs_.size() << "."; + for (auto mem_buf : free_mem_bufs_) { + delete mem_buf; + } + free_mem_bufs_.clear(); + + LOG_OUT << "Eager free mem buf size : " << eager_free_mem_bufs_.size() << "."; + for (auto mem_buf : eager_free_mem_bufs_) { + delete mem_buf; + } + eager_free_mem_bufs_.clear(); +} + +MemBuf *MemBufAllocator::Malloc(size_t size) { + // Malloc with expand block first. + if (MS_UNLIKELY(mem_blocks_.empty())) { + return MallocExpandBlock(size); + } + + search_key_->size_ = size; + auto it = free_mem_bufs_.lower_bound(search_key_); + MemBuf *candidate = nullptr; + // 1. Try to find in free mem bufs. + if (MS_LIKELY(it != free_mem_bufs_.end())) { + candidate = *it; + (void)free_mem_bufs_.erase(it); + return MapAndSplitMemBuf(candidate, size); + } + // 2. Try to search available buf, free and eager free buf. + candidate = SearchAvailableMemBuf(size); + if (MS_UNLIKELY(candidate != nullptr)) { + return candidate; + } + // 3. Try to find in eager free mem bufs. + it = eager_free_mem_bufs_.lower_bound(search_key_); + if (it != eager_free_mem_bufs_.end()) { + candidate = *it; + (void)eager_free_mem_bufs_.erase(it); + return MapAndSplitMemBuf(candidate, size); + } + + return nullptr; +} + +inline MemBuf *MemBufAllocator::SearchAvailableMemBuf(size_t size) { + if (!enable_eager_free_ || MS_UNLIKELY(is_customized_)) { + return nullptr; + } + // Search from back to front, because the free mem buf is sorted by size. + // More efficient way is to search more candidates, do it in the next version. + for (auto backward_it = free_mem_bufs_.rbegin(); backward_it != free_mem_bufs_.rend(); backward_it++) { + auto mem_buf = *backward_it; + auto next_buf = mem_buf->next_; + if (next_buf == nullptr || next_buf->status_ != MemBufStatus::kMemBufEagerFree || + mem_buf->size_ + next_buf->size_ < size) { + continue; + } + + // Located candidates, try map and split. 
+ auto need_map_size = size - mem_buf->size_; + auto mapped_size = mem_mapper_(need_map_size, next_buf->addr_); + if (mapped_size != need_map_size) { + LOG_OUT << "Map mem buf : " << mem_buf->ToJson() << ", next buf : " << next_buf->ToJson() << ", size : " << size + << ", need_map_size : " << need_map_size << ", mapped_size : " << mapped_size << " failed."; + return nullptr; + } + // Update mem buf. + free_mem_bufs_.erase(mem_buf); + mem_buf->size_ = size; + mem_buf->status_ = MemBufStatus::kMemBufUsed; + // Remove eager free buf and try update it. + eager_free_mem_bufs_.erase(next_buf); + next_buf->addr_ = static_cast(next_buf->addr_) + need_map_size; + next_buf->size_ = next_buf->size_ - need_map_size; + // If next buf is empty, remove it or update remain eager free mem buf. + if (next_buf->size_ == 0) { + mem_buf->next_ = next_buf->next_; + if (next_buf->next_ != nullptr) { + next_buf->next_->prev_ = mem_buf; + } + delete next_buf; + } else { + eager_free_mem_bufs_.insert(next_buf); + } + return mem_buf; + } + return nullptr; +} + +bool MemBufAllocator::Free(MemBuf *mem_buf, MemBufStatus target_status) { + // Change mem buf status to used by event, and wait for event to free. + if (MS_UNLIKELY(!mem_buf->IsEventNotUsed())) { + mem_buf->status_ = MemBufStatus::kMemBufUsedByEvent; + return false; + } + + mem_buf->status_ = target_status; + // Try to merge from prev. + auto prev_buf = mem_buf->prev_; + if (MS_LIKELY(prev_buf != nullptr && prev_buf->status_ == target_status)) { + // Erase prev buf pointer + auto prev = prev_buf->prev_; + mem_buf->prev_ = prev; + if (prev != nullptr) { + prev->next_ = mem_buf; + } + + mem_buf->addr_ = prev_buf->addr_; + mem_buf->size_ += prev_buf->size_; + if (target_status == MemBufStatus::kMemBufIdle) { + auto ret = free_mem_bufs_.erase(prev_buf); + if (ret == 0) { + LOG_ERROR << "Erase mem buf : " << mem_buf->ToJson() << " prev buf " << prev_buf->ToJson() << " failed."; + } + } else if (target_status == MemBufStatus::kMemBufEagerFree) { + auto ret = eager_free_mem_bufs_.erase(prev_buf); + if (ret == 0) { + LOG_ERROR << "Erase mem buf : " << mem_buf->ToJson() << " prev buf " << prev_buf->ToJson() << " failed."; + } + } + delete prev_buf; + } + // Try to merge from next. + auto next_buf = mem_buf->next_; + if (MS_LIKELY(next_buf != nullptr && next_buf->status_ == target_status)) { + // Erase next buf pointer + auto next = next_buf->next_; + mem_buf->next_ = next; + if (next != nullptr) { + next->prev_ = mem_buf; + } + + mem_buf->size_ += next_buf->size_; + if (target_status == MemBufStatus::kMemBufIdle) { + auto ret = free_mem_bufs_.erase(next_buf); + if (ret == 0) { + LOG_ERROR << "Erase next buf : " << next_buf->ToJson() << " failed."; + } + } else if (target_status == MemBufStatus::kMemBufEagerFree) { + auto ret = eager_free_mem_bufs_.erase(next_buf); + if (ret == 0) { + LOG_ERROR << "Erase next buf : " << next_buf->ToJson() << " failed."; + } + } + delete next_buf; + } + + if (target_status == MemBufStatus::kMemBufIdle) { + (void)free_mem_bufs_.emplace(mem_buf); + } else if (target_status == MemBufStatus::kMemBufEagerFree) { + (void)eager_free_mem_bufs_.emplace(mem_buf); + } + + return true; +} + +MemBuf *MemBufAllocator::MallocExpandBlock(size_t size) { + MemBlock *mem_block = ExpandBlock(size); + if (mem_block == nullptr) { + return nullptr; + } + MemBuf *candidate = new MemBuf( + mem_block->size_, mem_block->addr_, mem_block->stream_id_, mem_block, + MS_LIKELY(!is_customized_) && enable_eager_free_ ? 
MemBufStatus::kMemBufEagerFree : MemBufStatus::kMemBufIdle); + if (candidate->size_ < size) { + if (candidate->status_ == MemBufStatus::kMemBufIdle) { + (void)free_mem_bufs_.emplace(candidate); + } else { + (void)eager_free_mem_bufs_.emplace(candidate); + } + LOG_OUT << "Candidate size: " << candidate->size_ << " is less than required size : " << size << "."; + return nullptr; + } + + return MapAndSplitMemBuf(candidate, size); +} + +void MemBufAllocator::Initialize(size_t size) { + LOG_OUT << "Initialize allocator : " << BriefInfo() << " with size : " << size << "."; + if (enable_eager_free_ || MS_UNLIKELY(is_customized_)) { + LOG_OUT << "Skip initialization of allocator, since vmm is enabled."; + return; + } + MemBlock *mem_block = ExpandBlock(size); + if (mem_block == nullptr) { + LOG_OUT << "Initialize allocator failed, size : " << size << "."; + return; + } + MemBuf *mem_buf = + new MemBuf(mem_block->size_, mem_block->addr_, mem_block->stream_id_, mem_block, MemBufStatus::kMemBufIdle); + (void)free_mem_bufs_.emplace(mem_buf); +} + +const std::pair MemBufAllocator::FreeIdleMemsByEagerFree() { + // Free all idle mem bufs. + size_t eager_free_size = 0; + for (auto mem_buf : free_mem_bufs_) { + eager_free_size += mem_buf->size_; + Free(mem_buf, MemBufStatus::kMemBufEagerFree); + } + free_mem_bufs_.clear(); + // Do eager free on eager free mem bufs. + size_t real_free_size = 0; + for (auto mem_buf : eager_free_mem_bufs_) { + LOG_OUT << "Eager free mem buf : " << mem_buf << ", details : " << mem_buf->ToJson() << "."; + real_free_size += mem_eager_freer_(mem_buf->addr_, mem_buf->size_); + } + LOG_OUT << "Free idle mems by eager free, eager_free_size : " << eager_free_size + << ", real_free_size : " << real_free_size << "."; + return std::make_pair(eager_free_size, real_free_size); +} + +size_t MemBufAllocator::ReleaseFreeBlocks() { + size_t release_size = 0; + for (auto iter = mem_blocks_.begin(); iter != mem_blocks_.end();) { + auto mem_block = *iter; + MemBuf mem_buf(mem_block->size_, mem_block->addr_, mem_block->stream_id_, mem_block, MemBufStatus::kMemBufIdle); + // Judge if mem block in free mem bufs. + auto &&it = free_mem_bufs_.find(&mem_buf); + if (it == free_mem_bufs_.end()) { + iter++; + continue; + } + auto mem_buf_it = *it; + if (mem_buf_it->addr_ == mem_block->addr_ && mem_buf_it->size_ == mem_block->size_) { + LOG_OUT << "Release mem block : " << mem_block->ToJson() << "."; + bool ret = mem_block_cleaner_(mem_block); + if (!ret) { + LOG_OUT << "Clean mem block : " << mem_block->ToJson() << " failed."; + iter++; + continue; + } + free_mem_bufs_.erase(it); + delete mem_buf_it; + release_size += mem_block->size_; + delete mem_block; + iter = mem_blocks_.erase(iter); + } else { + iter++; + } + } + return release_size; +} + +inline MemBuf *MemBufAllocator::MapAndSplitMemBuf(MemBuf *candidate, size_t size) { + size_t remaining_size = candidate->size_ - size; + // Mmap memory first. + if (candidate->status_ == MemBufStatus::kMemBufEagerFree) { + size_t map_size = (remaining_size >= kDynamicMemAlignSize) ? size : candidate->size_; + auto mapped_size = mem_mapper_(map_size, candidate->addr_); + if (mapped_size != map_size) { + LOG_OUT << "Mapped_size : " << mapped_size << " is not equal to required size : " << map_size + << ", mem buf info : " << candidate->ToJson() << "."; + (void)eager_free_mem_bufs_.emplace(candidate); + return nullptr; + } + } + + bool need_split = remaining_size >= kDynamicMemAlignSize; + // Try to split mem buf. 
+ if (MS_LIKELY(need_split)) { + void *remaining_addr = static_cast(candidate->addr_) + size; + auto remaining_buf = + new MemBuf(remaining_size, remaining_addr, candidate->stream_id_, candidate->mem_block_, candidate->status_); + + auto next = candidate->next_; + if (next != nullptr) { + next->prev_ = remaining_buf; + remaining_buf->next_ = next; + } + candidate->next_ = remaining_buf; + remaining_buf->prev_ = candidate; + if (remaining_buf->status_ == MemBufStatus::kMemBufIdle) { + (void)free_mem_bufs_.emplace(remaining_buf); + } else { + (void)eager_free_mem_bufs_.emplace(remaining_buf); + } + + // Update candidate size. + candidate->size_ = size; + } + + candidate->status_ = MemBufStatus::kMemBufUsed; + // Update mem block usage. + candidate->mem_block_->UpdateBorderAddr(candidate); + + return candidate; +} + +inline MemBlock *MemBufAllocator::ExpandBlock(size_t size) { + MemBlock *mem_block = mem_block_expander_(size); + if (mem_block == nullptr) { + LOG_OUT << "Expand block failed, expand size : " << size << ", memory is not enough."; + return nullptr; + } + + if (mem_block->size_ < size) { + LOG_OUT << "Expand block failed, expand size : " << mem_block->size_ << " is less than require size : " << size + << "."; + } + + (void)mem_blocks_.emplace_back(mem_block); + return mem_block; +} + +AbstractDynamicMemPool::AbstractDynamicMemPool() {} + +void AbstractDynamicMemPool::Initialize(size_t init_size, size_t increase_size, size_t max_size) { + if (init_size == 0) { + LOG_OUT << "Skip initialization of memory pool since init size is not configured."; + return; + } + + LockGuard lock(lock_); + LOG_OUT << "Initialize dynamic memory pool, init size : " << init_size << ", increase size : " << increase_size + << ", max size : " << max_size << "."; + init_size_ = init_size >> 1; + increase_size_ = increase_size; + max_size_ = max_size; + + // Do initialization with init size. + auto persistent_allocator = GetMemBufAllocator(init_size_, true, kDefaultStreamIndex); + persistent_allocator->Initialize(AlignMemorySize(init_size_)); + auto common_allocator = GetMemBufAllocator(init_size_, false, kDefaultStreamIndex); + common_allocator->Initialize(AlignMemorySize(init_size_)); +} + +void AbstractDynamicMemPool::ReleaseDeviceRes() { + LockGuard lock(lock_); + for (const auto &iter : stream_pair_mem_bufs_) { + auto size = iter.second.size(); + LOG_OUT << "Event referred stream_pair_mem_bufs_[" << iter.first.first << "-" << iter.first.second + << "], size : " << size << "."; + } + // Clear map of address to mem buf. + for (const auto &iter : addr_mem_buf_allocators_) { + auto mem_buf = iter.second.first; + delete mem_buf; + } + addr_mem_buf_allocators_.clear(); + + LOG_OUT << "Release device resource for " << GetMemoryPoolType() << " : " << mem_stat_.ToReadableString() << "."; + for (const auto &stream_id_allocator : stream_id_allocators_) { + const auto &allocator = stream_id_allocator.second; + allocator->ReleaseDeviceRes(); + } + for (const auto &customized_allocator : customized_allocators_) { + const auto &allocator = customized_allocator.second; + allocator->ReleaseDeviceRes(); + } + stream_id_allocators_.clear(); + customized_allocators_.clear(); + stream_pair_mem_bufs_.clear(); + mem_stat_.Reset(); +} + +/** + * @brief Alloc tensor mem. 
+ * Allocation follows the steps below:
+ * 1. align the size
+ * 2. find from the current allocator; if that fails, go to 3
+ * 3. find from another allocator; if that fails, go to 4
+ * 4. do eager free and find from the current allocator again; if that fails, go to 5
+ * 5. expand a block
+ */
+DeviceMemPtr AbstractDynamicMemPool::AllocTensorMem(size_t size, bool from_persistent_mem, bool, uint32_t stream_id) {
+  size_t align_size = AlignMemorySize(size);
+  LockGuard lock(lock_);
+  auto &&mem_buf_allocator = AllocMemBuf(align_size, from_persistent_mem, stream_id);
+  if (MS_UNLIKELY(mem_buf_allocator.first == nullptr)) {
+    return nullptr;
+  }
+
+  (void)addr_mem_buf_allocators_.emplace(mem_buf_allocator.first->addr_, mem_buf_allocator);
+  return mem_buf_allocator.first->addr_;
+}
+
+/**
+ * @brief Alloc mem buf.
+ * Strategy when vmm is disabled:
+ * Persistent memory: first malloc from its own pool; if that fails, try to malloc from the common pool.
+ * Common memory: first malloc from its own pool; if that fails, try to expand the pool.
+ * If the expansion fails, try to malloc from the persistent pool.
+ */
+inline std::pair<MemBuf *, MemBufAllocator *> AbstractDynamicMemPool::AllocMemBuf(size_t align_size,
+                                                                                  bool from_persistent_mem,
+                                                                                  uint32_t stream_id) {
+  auto allocator = GetMemBufAllocator(align_size, from_persistent_mem, stream_id);
+
+  auto mem_buf = allocator->Malloc(align_size);
+  if (MS_UNLIKELY(mem_buf == nullptr)) {
+    // Enable malloc from another allocator when from_persistent_mem is true and vmm is not enabled.
+    if (!enable_vmm_ && from_persistent_mem && MS_LIKELY(!enable_custom_allocator_)) {
+      auto common_allocator = GetMemBufAllocator(align_size, false, stream_id);
+      mem_buf = common_allocator->Malloc(align_size);
+      allocator = common_allocator;
+    }
+
+    if (MS_UNLIKELY(mem_buf == nullptr)) {
+      if ((enable_vmm_ || IsEnableEagerFree()) && MS_LIKELY(!enable_custom_allocator_)) {
+        WaitPipelineHelper();
+        if (!SyncAllStreams()) {
+          LOG_ERROR << "Sync all streams failed.";
+          return std::make_pair(nullptr, nullptr);
+        }
+        (void)FreeIdleMemsByEagerFree();
+        mem_buf = allocator->Malloc(align_size);
+      }
+      if (MS_UNLIKELY(mem_buf == nullptr)) {
+        mem_buf = allocator->MallocExpandBlock(align_size);
+        if (MS_UNLIKELY(mem_buf == nullptr)) {
+          if (MS_LIKELY(!from_persistent_mem) && MS_LIKELY(!enable_custom_allocator_)) {
+            // Common pool expand block failed, try to malloc from persistent pool.
+            auto persistent_allocator = GetMemBufAllocator(align_size, true, stream_id);
+            mem_buf = persistent_allocator->Malloc(align_size);
+            if (MS_LIKELY(mem_buf != nullptr)) {
+              allocator = persistent_allocator;
+            }
+          }
+
+          if (MS_UNLIKELY(mem_buf == nullptr)) {
+            LOG_OUT << "Alloc tensor mem failed and try to sync all events to release memory.";
+            (void)DoSyncAllEvents();
+            mem_buf = allocator->Malloc(align_size);
+            if (MS_UNLIKELY(mem_buf == nullptr)) {
+              return std::make_pair(nullptr, nullptr);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Update stat.
+  mem_stat_.used_size_ += mem_buf->size_;
+  mem_stat_.UpdatePeakSize(enable_vmm_, GetVmmUsedMemSize());
+  return std::make_pair(mem_buf, allocator);
+}
+
+std::vector<DeviceMemPtr> AbstractDynamicMemPool::AllocContinuousTensorMem(const std::vector<size_t> &size_list,
+                                                                           uint32_t stream_id) {
+  std::vector<DeviceMemPtr> device_addr_list;
+  size_t total_size = std::accumulate(size_list.begin(), size_list.end(), static_cast<size_t>(0));
+  // Pre-alloc the one whole piece memory.
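+  // A hedged sketch of the expected layout (example sizes, not taken from real callers):
+  // size_list = {512, 1024, 512} first allocates one 2048-byte buf, then the split loop
+  // below carves it into three linked mem bufs that share the same mem block:
+  //   addr + 0    : 512  bytes -> device_addr_list[0]
+  //   addr + 512  : 1024 bytes -> device_addr_list[1]
+  //   addr + 1536 : 512  bytes -> device_addr_list[2]
+  // Each piece gets its own entry in addr_mem_buf_allocators_, so it can later be freed
+  // independently through DoFreeTensorMem.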
+  auto device_addr = AbstractDynamicMemPool::AllocTensorMem(total_size, false, false, stream_id);
+  if (device_addr == nullptr) {
+    return device_addr_list;
+  }
+
+  (void)device_addr_list.emplace_back(device_addr);
+  if (size_list.size() == 1) {
+    return device_addr_list;
+  }
+
+  // Try to split mem bufs.
+  LockGuard lock(lock_);
+  auto &&it = addr_mem_buf_allocators_.find(device_addr);
+  if (it != addr_mem_buf_allocators_.end()) {
+    auto mem_buf = it->second.first;
+    auto allocator = it->second.second;
+    mem_buf->size_ = size_list[0];
+    MemBuf *prev_mem_buf = mem_buf;
+    void *next_addr = static_cast<uint8_t *>(mem_buf->addr_) + size_list[0];
+    total_size -= size_list[0];
+    for (size_t i = 1; i < size_list.size(); i++) {
+      auto new_mem_buf = new MemBuf(size_list[i], next_addr, stream_id, mem_buf->mem_block_, MemBufStatus::kMemBufUsed);
+      new_mem_buf->Link(prev_mem_buf, prev_mem_buf->next_);
+      (void)addr_mem_buf_allocators_.emplace(new_mem_buf->addr_, std::make_pair(new_mem_buf, allocator));
+      // Update result.
+      (void)device_addr_list.emplace_back(next_addr);
+      // Update next addr and prev mem buf.
+      if (i < size_list.size() - 1) {
+        next_addr = static_cast<uint8_t *>(next_addr) + size_list[i];
+        total_size -= size_list[i];
+        prev_mem_buf = new_mem_buf;
+      } else {
+        // Update the last mem buf.
+        if (total_size != size_list[i]) {
+          LOG_OUT << "Remain size : " << total_size << " is not equal to last size : " << size_list[i] << ".";
+          new_mem_buf->size_ = total_size;
+        }
+      }
+    }
+  } else {
+    // Unreachable routine.
+    LOG_ERROR << "Find addr : " << device_addr << " failed.";
+  }
+
+  return device_addr_list;
+}
+
+// The main program entry of memory free.
+void AbstractDynamicMemPool::FreeTensorMem(const DeviceMemPtr &device_addr) {
+  LockGuard lock(lock_);
+  (void)DoFreeTensorMem(device_addr);
+}
+
+// The inner implementation of memory free, called with the pool lock held.
+bool AbstractDynamicMemPool::DoFreeTensorMem(const DeviceMemPtr &device_addr) {
+  void *addr = device_addr;
+  auto &&it = addr_mem_buf_allocators_.find(device_addr);
+  if (MS_LIKELY(it != addr_mem_buf_allocators_.end())) {
+    auto allocator = it->second.second;
+    auto mem_buf = it->second.first;
+    auto free_size = mem_buf->size_;
+    if (MS_LIKELY(allocator->Free(mem_buf))) {
+      mem_stat_.used_size_ -= free_size;
+      (void)addr_mem_buf_allocators_.erase(it);
+      return true;
+    }
+  } else {
+    // This may be a normal case.
+    LOG_OUT << "Free tensor mem failed, can not find address : " << addr << ".";
+  }
+  return false;
+}
+
+inline MemBufAllocator *AbstractDynamicMemPool::GetMemBufAllocator(size_t size, bool from_persistent_mem,
+                                                                   uint32_t stream_id) {
+  // Do not use the small pool.
+  const AllocatorInfo key{stream_id, from_persistent_mem, false};
+  LOG_OUT << "Get allocator, " << key.ToString() << ".";
+
+  MemBufAllocatorPtr allocator = nullptr;
+
+  auto &&it = stream_id_allocators_.find(key);
+  if (it == stream_id_allocators_.end()) {
+    allocator = GenerateAllocator(key);
+    (void)stream_id_allocators_.emplace(key, allocator);
+  } else {
+    allocator = it->second;
+  }
+  return allocator.get();
+}
+
+// Keep addrs are within the free addrs, so find their mem bufs first.
+// Then traverse the keep addrs and split the candidates, as sketched below.
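+//
+// A minimal sketch of the intended semantics (hypothetical addresses, not from a real trace):
+//   free_addrs      = {0x1000}   // one used buf covering [0x1000, 0x2000)
+//   keep_addrs      = {0x1400}
+//   keep_addr_sizes = {0x200}
+// splits the buf into three pieces:
+//   [0x1000, 0x1400) head -> freed back to the allocator,
+//   [0x1400, 0x1600) keep -> stays registered and in used status,
+//   [0x1600, 0x2000) tail -> freed back to the allocator.
+// Only the head and tail re-enter the allocator's free sets; the keep piece is untouched.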
+void AbstractDynamicMemPool::FreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) { + LOG_OUT << "Free part tensor mems."; + LockGuard lock(lock_); + (void)DoFreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes); +} + +std::vector AbstractDynamicMemPool::DoFreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) { + std::vector mem_bufs; + std::map> candidates; + for (const auto &free_addr : free_addrs) { + auto &&it = addr_mem_buf_allocators_.find(free_addr); + if (it != addr_mem_buf_allocators_.end()) { + (void)candidates.emplace(it->first, it->second); + } else { + // This is illegal routine, but level0 case entered. + LOG_OUT << "Find address : " << free_addr << " failed."; + } + } + + std::set processed_keep_addrs; + for (size_t i = 0; i < keep_addrs.size(); i++) { + auto keep_addr = keep_addrs[i]; + std::uintptr_t keep_addr_to_size = reinterpret_cast(keep_addr); + if (processed_keep_addrs.count(keep_addr_to_size) > 0) { + LOG_OUT << "Duplicate keep address : " << keep_addr << "."; + continue; + } + (void)processed_keep_addrs.insert(keep_addr_to_size); + auto &&it = candidates.upper_bound(keep_addr); + if (it == candidates.begin()) { + LOG_OUT << "Locate keep addr : " << keep_addr << " failed."; + continue; + } + auto iter = --it; + auto mem_buf = iter->second.first; + auto allocator = iter->second.second; + std::uintptr_t base_start = reinterpret_cast(mem_buf->addr_); + std::uintptr_t base_end = base_start + mem_buf->size_; + std::uintptr_t keep_start = keep_addr_to_size; + std::uintptr_t keep_end = keep_start + keep_addr_sizes[i]; + // Since free part tensor mem may double free keep addr, continue for these keep addrs. + if (keep_start >= base_end) { + LOG_OUT << "Check range error, base start : " << base_start << ", base end : " << base_end + << ", keep start : " << keep_start << ", keep end : " << keep_end << "."; + continue; + } + // Split candidates. If keep start equal to base start, split mem buf into two parts, or three parts. + // First construct keep mem buf and set it into addr_mem_buf_allocators_, then process head buf and tail buf. + MemBuf *keep_mem_buf = nullptr; + if (keep_start == base_start) { + keep_mem_buf = mem_buf; + keep_mem_buf->size_ = keep_addr_sizes[i]; + // Remove keep addr since keep start equal to base start, no need to free keep addr any more. + (void)candidates.erase(mem_buf->addr_); + } else { + // Split middle mem buf. + keep_mem_buf = + new MemBuf(keep_addr_sizes[i], keep_addr, mem_buf->stream_id_, mem_buf->mem_block_, mem_buf->status_); + keep_mem_buf->Link(mem_buf, mem_buf->next_); + (void)addr_mem_buf_allocators_.emplace(keep_addr, std::make_pair(keep_mem_buf, allocator)); + std::uintptr_t prev_remain_size = keep_start - base_start; + mem_buf->size_ = prev_remain_size; + } + (void)mem_bufs.emplace_back(keep_mem_buf); + LOG_OUT << "keep_mem_buf : " << keep_mem_buf->ToJson() << "."; + // Process last mem buf. 
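+    // Continuing the running example above: any bytes between keep_end and base_end still
+    // belong to the original buf's range, so a tail buf of size (base_end - keep_end) is
+    // linked in behind the keep buf and queued as a free candidate. When keep_end equals
+    // base_end there is no tail and nothing is created.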
+ if (keep_end < base_end) { + void *last_addr = static_cast(keep_mem_buf->addr_) + keep_mem_buf->size_; + auto last_mem_buf = new MemBuf(base_end - keep_end, last_addr, keep_mem_buf->stream_id_, keep_mem_buf->mem_block_, + mem_buf->status_); + last_mem_buf->Link(keep_mem_buf, keep_mem_buf->next_); + (void)addr_mem_buf_allocators_.emplace(last_mem_buf->addr_, std::make_pair(last_mem_buf, allocator)); + if (candidates.count(last_mem_buf->addr_) > 0) { + LOG_OUT << "Duplicate address : " << last_mem_buf->addr_ << "."; + } + LOG_OUT << "last mem buf : " << last_mem_buf->ToJson() << "."; + (void)candidates.emplace(last_mem_buf->addr_, std::make_pair(last_mem_buf, allocator)); + } + } + for (const auto &candidate : candidates) { + auto mem_buf = candidate.second.first; + if (!AbstractDynamicMemPool::DoFreeTensorMem(mem_buf->addr_)) { + LOG_ERROR << "Free device address failed : " << mem_buf->addr_ << ", mem_buf : " << mem_buf->ToJson() << "."; + } + } + return mem_bufs; +} + +MemBufAllocatorPtr AbstractDynamicMemPool::GenerateAllocator(const AllocatorInfo &allocator_key) { + const auto is_persistent = allocator_key.from_persistent_mem; + const auto stream_id = allocator_key.stream_id; + const auto is_small = allocator_key.use_small_pool; + + LOG_OUT << "Generate allocator, " << allocator_key.ToString() << "."; + std::function mem_block_expander = [&, is_persistent = is_persistent, + stream_id = stream_id](size_t size) { + size_t block_size = CalMemBlockAllocSize(size, is_persistent); + MemBlock *mem_block = nullptr; + if (block_size == 0) { + LOG_OUT << "Malloc mem block failed, is enable eager free : " << IsEnableEagerFree() + << ", is enable vmm : " << IsEnableVmm() << ", size : " << size << ", block size is 0."; + return mem_block; + } + DeviceMemPtr addr = nullptr; + size_t alloc_size; + LOG_OUT << "Malloc mem block, is enable eager free : " << IsEnableEagerFree() + << ", is enable vmm : " << IsEnableVmm() << ", size : " << size << ", block size : " << block_size << "."; + if (IsEnableVmm() || IsEnableEagerFree()) { + // Virtual address is unlimited. + auto eager_free_size = std::max(block_size, static_cast(total_mem_size())); + alloc_size = AllocDeviceMemByEagerFree(eager_free_size, &addr); + mem_stat_.eager_free_size_ += alloc_size; + } else { + alloc_size = AllocDeviceMem(block_size, &addr); + if (alloc_size < block_size) { + LOG_OUT << "Alloc device mem failed, alloc size : " << alloc_size << ", block size : " << block_size << "."; + } + } + if (alloc_size == 0) { + return mem_block; + } + mem_stat_.alloc_size_ += alloc_size; + mem_block = new MemBlock(alloc_size, addr, stream_id); + LOG_OUT << "Malloc mem block : " << mem_block->ToJson() << "."; + return mem_block; + }; + + std::function mem_block_cleaner = [&](MemBlock *mem_block) { + mem_stat_.alloc_size_ -= mem_block->size_; + // Call free device mem as ascend memory pool would do stat in free operation. 
+    return FreeDeviceMem(mem_block->addr_);
+  };
+  std::function<size_t(size_t, void *)> mem_mapper = [&](size_t size, void *addr) {
+    mem_stat_.eager_free_size_ -= size;
+    return MmapDeviceMem(size, addr);
+  };
+  std::function<size_t(void *, const size_t)> mem_eager_freer = [&](void *addr, const size_t size) {
+    LOG_OUT << "Eager free addr : " << addr << ", size : " << size << ".";
+    return FreeDeviceMemByEagerFree(addr, size);
+  };
+
+  return std::make_shared<MemBufAllocator>(mem_block_expander, mem_block_cleaner, mem_mapper, mem_eager_freer,
+                                           IsEnableVmm() || IsEnableEagerFree(), is_persistent, stream_id, is_small);
+}
+
+// Element in vector : <memory_stream_id, address>
+bool AbstractDynamicMemPool::RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                                         const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
+                                         const DeviceEventPtr &event) {
+  LOG_OUT << "Record event for task id on stream : " << task_id_on_stream << ", user stream id : " << user_stream_id
+          << ".";
+  LockGuard lock(lock_);
+  for (auto &[memory_stream_id, addr] : memory_stream_addresses) {
+    auto &&it = addr_mem_buf_allocators_.find(addr);
+    if (it != addr_mem_buf_allocators_.end()) {
+      auto mem_buf = it->second.first;
+      if (mem_buf->IsEventNotUsed()) {
+        mem_stat_.used_by_event_size_ += mem_buf->size_;
+      }
+      LOG_OUT << "Record event for : " << mem_buf->ToJson() << ".";
+      (void)mem_buf->RecordEvent(task_id_on_stream, user_stream_id, event);
+      (void)stream_pair_mem_bufs_[std::make_pair(user_stream_id, memory_stream_id)].emplace(mem_buf);
+    } else {
+      // The output of a somas sub graph may be used by an inner node of that sub graph, so the address may not be
+      // kept in the mem pool.
+      LOG_OUT << "Unknown address : " << addr << ".";
+    }
+  }
+  return true;
+}
+
+bool AbstractDynamicMemPool::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) {
+  LOG_OUT << "Wait event for task id on stream : " << task_id_on_stream << ", user stream id : " << user_stream_id
+          << ", memory stream id : " << memory_stream_id << ".";
+  LockGuard lock(lock_);
+  auto key = std::make_pair(user_stream_id, memory_stream_id);
+  auto iter = stream_pair_mem_bufs_.find(key);
+  if (iter == stream_pair_mem_bufs_.end()) {
+    return false;
+  }
+
+  // Iterate over a copy, since waiting may erase bufs from the underlying set.
+  auto mem_bufs = iter->second;
+  for (const auto &mem_buf : mem_bufs) {
+    LOG_OUT << "Wait event for : " << mem_buf->ToJson() << ".";
+    mem_buf->WaitEvent(task_id_on_stream, user_stream_id);
+    // Remove event and try to free memory.
+    if (mem_buf->IsEventNotUsed()) {
+      mem_stat_.used_by_event_size_ -= mem_buf->size_;
+      // Force clear all mem bufs.
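+      // A buf recorded from several user streams appears in several
+      // (user_stream, memory_stream) sets, so it must be erased from all of them,
+      // not only from the pair being waited on, before it can be freed.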
+ for (auto &kv : stream_pair_mem_bufs_) { + (void)kv.second.erase(mem_buf); + } + if (mem_buf->status_ == DynamicMemBufStatus::kMemBufUsedByEvent) { + (void)DoFreeTensorMem(mem_buf->addr_); + } + } + } + } + return true; +} + +bool AbstractDynamicMemPool::SyncAllEvents() { + LOG_OUT << "Sync all events, stream_pair_addresses_ size : " << stream_pair_mem_bufs_.size() << "."; + LockGuard lock(lock_); + return DoSyncAllEvents(); +} + +bool AbstractDynamicMemPool::DoSyncAllEvents() { + if (stream_pair_mem_bufs_.empty()) { + return false; + } + + std::set carry_event_mem_bufs; + for (const auto &stream_pair_mem_buf : stream_pair_mem_bufs_) { + for (const auto &mem_buf : stream_pair_mem_buf.second) { + (void)carry_event_mem_bufs.emplace(mem_buf); + } + } + for (auto &mem_buf : carry_event_mem_bufs) { + if (mem_buf->SyncAllEvents() && mem_buf->status_ == DynamicMemBufStatus::kMemBufUsedByEvent) { + (void)DoFreeTensorMem(mem_buf->addr_); + } + } + + stream_pair_mem_bufs_.clear(); + return true; +} + +size_t AbstractDynamicMemPool::CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool) { + auto device_free_mem_size = free_mem_size(); + // Make sure available mem is enough. + if (device_free_mem_size < size) { + LOG_OUT << "Memory not enough: current free memory size[" << device_free_mem_size + << "] is smaller than required size[" << size << "]."; + return 0; + } + auto unit_size = MemAllocUnitSize(from_persistent_mem); + if (device_free_mem_size < unit_size) { + LOG_OUT << "Device memory size [" << device_free_mem_size << "] is smaller than unit size [" << unit_size << "]."; + } + // Calculate alloc size. + size_t alloc_size = unit_size; + if (size > unit_size) { + alloc_size = ((size + unit_size - 1) / unit_size) * unit_size; + } + return std::min(alloc_size, device_free_mem_size); +} + +void AbstractDynamicMemPool::DefragMemory() { + LOG_OUT << "Try to defrag memory."; + LockGuard lock(lock_); + + if (!enable_vmm_) { + LOG_OUT << "Skip defrag memory since vmm is not enabled."; + return; + } + + if (eager_free_count_ == 0) { + LOG_OUT << "Exit defrag memory since eager free count is 0."; + return; + } + if (last_eager_free_count_ == eager_free_count_) { + LOG_OUT << "Exit defrag memory since last eager free count equals to eager free count : " << last_eager_free_count_ + << "."; + return; + } + + LOG_OUT << "Defrag memory start."; + WaitPipelineHelper(); + if (!SyncAllStreams()) { + LOG_ERROR << "Sync all streams failed."; + return; + } + const auto [eager_free_size, real_free_size] = FreeIdleMemsByEagerFree(); + LOG_OUT << "Defrag memory, eager_free_size : " << eager_free_size << ", real_free_size : " << real_free_size << "."; + last_eager_free_count_ = eager_free_count_; +} + +void AbstractDynamicMemPool::WaitPipelineHelper() { + if (pipeline_callback_) { + lock_.unlock(); + pipeline_callback_(); + lock_.lock(); + } +} + +std::string AbstractDynamicMemPool::DynamicMemPoolStateInfo() const { + std::stringstream ss; + // Classify mem buf and stat mem buf state info. 
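+  // For instance (hypothetical numbers), three live bufs tagged kWeight (4 MB),
+  // kWeight (2 MB) and kKernel (1 MB) would yield
+  //   mem_buf_used_stat[kWeight] == 6 MB and mem_buf_used_stat[kKernel] == 1 MB,
+  // while the per-allocator set sizes reported below count bufs, not bytes.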
+  size_t mem_buf_used_stat[static_cast<int>(memory::mem_pool::MemType::kOther) + 1] = {0};
+  struct AddrComparator {
+    bool operator()(MemBuf *const &left, MemBuf *const &right) const { return left->addr_ < right->addr_; }
+  };
+  std::map<MemBufAllocator *, std::set<MemBuf *, AddrComparator>> allocator_mem_bufs;
+  for (const auto &addr_mem_buf_allocator : addr_mem_buf_allocators_) {
+    const auto allocator = addr_mem_buf_allocator.second.second;
+    const auto mem_buf = addr_mem_buf_allocator.second.first;
+    mem_buf_used_stat[static_cast<int>(mem_buf->alloc_type_)] += mem_buf->size_;
+    auto &mem_bufs = allocator_mem_bufs[allocator];
+    (void)mem_bufs.insert(mem_buf);
+  }
+  for (const auto &[allocator, mem_bufs] : allocator_mem_bufs) {
+    ss << "\tIn used mem buf info for " << allocator->BriefInfo() << ", mem_bufs size : " << mem_bufs.size() << "\n";
+  }
+
+  size_t other_used_size = 0;
+  int start = static_cast<int>(memory::mem_pool::MemType::kGraphOutput);
+  int end = static_cast<int>(memory::mem_pool::MemType::kOther);
+  for (int i = start; i <= end; i++) {
+    other_used_size += mem_buf_used_stat[i];
+  }
+
+  ss << "The dynamic memory pool[" << GetMemoryPoolType() << "] stat info : " << mem_stat_.ToReadableString()
+     << ", actual peak used mem:" << ActualPeakStatistics() / kMBToByte
+     << "M. Weight used size:" << mem_buf_used_stat[static_cast<int>(memory::mem_pool::MemType::kWeight)] / kMBToByte
+     << "M, constant value used size:"
+     << mem_buf_used_stat[static_cast<int>(memory::mem_pool::MemType::kConstantValue)] / kMBToByte
+     << "M, kernel output used size:"
+     << mem_buf_used_stat[static_cast<int>(memory::mem_pool::MemType::kKernel)] / kMBToByte
+     << "M, other used size:" << other_used_size / kMBToByte << "M.\n";
+  return ss.str();
+}
+
+const std::pair<size_t, size_t> AbstractDynamicMemPool::FreeIdleMemsByEagerFree() {
+  if (!IsEnableVmm() && !IsEnableEagerFree()) {
+    LOG_OUT << "FreeIdleMemsByEagerFree is not allowed since vmm is not enabled.";
+    return std::make_pair(0L, 0L);
+  }
+
+  LOG_OUT << "Free idle mems by eager free start, allocator size : " << stream_id_allocators_.size() << ".";
+  eager_free_count_++;
+
+  size_t total_eager_free_size = 0;
+  size_t total_real_free_size = 0;
+  for (auto &stream_id_allocator : stream_id_allocators_) {
+    const auto [eager_free_size, real_free_size] = stream_id_allocator.second->FreeIdleMemsByEagerFree();
+    total_eager_free_size += eager_free_size;
+    total_real_free_size += real_free_size;
+  }
+
+  size_t not_free_size =
+    total_eager_free_size > total_real_free_size ? (total_eager_free_size - total_real_free_size) : 0;
+  LOG_OUT << "Eager free count : " << eager_free_count_ << ", free memory : " << total_eager_free_size
+          << ", real free : " << total_real_free_size << ", not free : " << not_free_size << ".";
+
+  mem_stat_.eager_free_size_ += total_eager_free_size;
+  return {total_eager_free_size, total_real_free_size};
+}
+
+size_t AbstractDynamicMemPool::ReleaseFreeBlocks() {
+  LOG_OUT << "Release free blocks start.";
+  size_t release_size = 0;
+  for (auto &stream_id_allocator : stream_id_allocators_) {
+    release_size += stream_id_allocator.second->ReleaseFreeBlocks();
+  }
+  LOG_OUT << "Release free blocks size : " << release_size << ".";
+  return release_size;
+}
+
+size_t AbstractDynamicMemPool::ReleaseCustomFreeBlocks() {
+  LOG_OUT << "Release custom free blocks start.";
+  size_t release_size = 0;
+  for (auto &customized_allocator : customized_allocators_) {
+    release_size += customized_allocator.second->ReleaseFreeBlocks();
+  }
+  LOG_OUT << "Release custom free blocks size : " << release_size << ".";
+  return release_size;
+}
+
+// The statistics information.
+size_t AbstractDynamicMemPool::TotalMemStatistics() const {
+  if (IsEnableVmm()) {
+    return GetVmmUsedMemSize() + mem_stat_.custom_alloc_size_;
+  }
+  return mem_stat_.alloc_size_ + mem_stat_.custom_alloc_size_;
+}
+
+size_t AbstractDynamicMemPool::TotalUsedMemStatistics() const { return mem_stat_.used_size_; }
+
+size_t AbstractDynamicMemPool::TotalUsedByEventMemStatistics() const { return mem_stat_.used_by_event_size_; }
+
+size_t AbstractDynamicMemPool::TotalIdleMemStatistics() const { return mem_stat_.IdleSize(); }
+
+size_t AbstractDynamicMemPool::TotalEagerFreeMemStatistics() const { return mem_stat_.eager_free_size_; }
+
+size_t AbstractDynamicMemPool::UsedMemPeakStatistics() const { return mem_stat_.peak_size_; }
+
+size_t AbstractDynamicMemPool::MaxMemAllocatedStatistics() const { return mem_stat_.iter_used_peak_size_; }
+
+size_t AbstractDynamicMemPool::MaxMemReservedStatistics() const { return mem_stat_.iter_alloc_peak_size_; }
+
+size_t AbstractDynamicMemPool::ActualPeakStatistics() const {
+  if (IsEnableVmm()) {
+    return GetVmmUsedMemSize() + mem_stat_.custom_alloc_size_;
+  }
+
+  size_t peak_size = 0;
+  for (auto &stream_id_allocator : stream_id_allocators_) {
+    peak_size += stream_id_allocator.second->ActualPeakSize();
+  }
+  for (auto &customized_allocator : customized_allocators_) {
+    peak_size += customized_allocator.second->ActualPeakSize();
+  }
+  return peak_size;
+}
+
+std::unordered_map<std::string, size_t> AbstractDynamicMemPool::BlockCountsStatistics() const {
+  LockGuard lock(lock_);
+  size_t persistent_block_count = 0;
+  size_t common_block_count = 0;
+  for (const auto &[allocator_info, allocator_ptr] : stream_id_allocators_) {
+    if (allocator_info.from_persistent_mem) {
+      persistent_block_count += allocator_ptr->mem_blocks_.size();
+    } else {
+      common_block_count += allocator_ptr->mem_blocks_.size();
+    }
+  }
+  std::unordered_map<std::string, size_t> block_counts;
+  block_counts[kPersistentMemPoolType] = persistent_block_count;
+  block_counts[kCommonMemPoolType] = common_block_count;
+  return block_counts;
+}
+
+std::unordered_map<std::string, size_t> AbstractDynamicMemPool::BlockUnitSizeStatistics() const {
+  LockGuard lock(lock_);
+  std::unordered_map<std::string, size_t> block_units;
+  block_units[kPersistentMemPoolType] = persist_unit_size_;
+  block_units[kCommonMemPoolType] = common_unit_size_;
+  return block_units;
+}
+
+std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+AbstractDynamicMemPool::CommonMemBlocksInfoStatistics() const {
+  LockGuard lock(lock_);
+  std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>> block_infos;
+  for (const auto &[allocator_info, allocator_ptr] : stream_id_allocators_) {
+    if (!allocator_info.from_persistent_mem) {
+      const auto &mem_blocks = allocator_ptr->mem_blocks_;
+      for (const auto mem_block : mem_blocks) {
+        std::unordered_map<std::string, size_t> block_info;
+        block_info[kBlockMemorySize] = mem_block->size_;
+        block_info[kBlockStreamId] = mem_block->stream_id_;
+        block_infos[mem_block->addr_] = block_info;
+      }
+    }
+  }
+  return block_infos;
+}
+
+std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+AbstractDynamicMemPool::PersistentMemBlocksInfoStatistics() const {
+  LockGuard lock(lock_);
+  std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>> block_infos;
+  for (const auto &[allocator_info, allocator_ptr] : stream_id_allocators_) {
+    if (allocator_info.from_persistent_mem) {
+      const auto &mem_blocks = allocator_ptr->mem_blocks_;
+      for (const auto mem_block : mem_blocks) {
+        std::unordered_map<std::string, size_t> block_info;
+        block_info[kBlockMemorySize] = mem_block->size_;
+        block_info[kBlockStreamId] = mem_block->stream_id_;
+        block_infos[mem_block->addr_] = block_info;
+      }
+    }
+  }
+  return block_infos;
+}
+
+void AbstractDynamicMemPool::ResetMaxMemReserved() {
+  LockGuard lock(lock_);
+  mem_stat_.iter_alloc_peak_size_ = IsEnableVmm() ? GetVmmUsedMemSize() + mem_stat_.custom_alloc_size_
+                                                  : mem_stat_.alloc_size_ + mem_stat_.custom_alloc_size_;
+}
+
+void AbstractDynamicMemPool::ResetMaxMemAllocated() {
+  LockGuard lock(lock_);
+  mem_stat_.iter_used_peak_size_ = mem_stat_.used_size_;
+}
+
+AbstractEnhancedDynamicMemPool::AbstractEnhancedDynamicMemPool() {}
+
+void AbstractEnhancedDynamicMemPool::ReportMemoryPoolInfo() {
+  // Report memory data to profiler.
+ if (memory_profiler_callback_) { + memory_profiler_callback_(); + } +} + +void AbstractEnhancedDynamicMemPool::ReportMemoryPoolMallocInfoToMstx(void *addr, size_t size) { + if (memory_malloc_mstx_callback_) { + memory_malloc_mstx_callback_(addr, size); + } +} + +void AbstractEnhancedDynamicMemPool::ReportMemoryPoolFreeInfoToMstx(void *addr) { + if (memory_free_mstx_callback_) { + memory_free_mstx_callback_(addr); + } +} + +MemoryTimeEventPtr AbstractEnhancedDynamicMemPool::GenAllocateMemoryTimeEvent(const void *addr, size_t size, + uint32_t stream_id, bool from_persistent, + bool is_persistent) { + auto time_event = std::make_shared(); + time_event->created_at_ = static_cast( + std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()) + .count()); + time_event->addr_ = const_cast(addr); + time_event->size_ = size; + time_event->from_persistent_ = static_cast(from_persistent); + time_event->is_persistent_ = static_cast(is_persistent); + time_event->stream_id_ = stream_id; + time_event->run_mode_ = DynamicMemAllocatorDebugInfo::GetDebugInfo().run_mode_; + time_event->used_size_ = mem_stat_.used_size_; + time_event->peak_size_ = mem_stat_.peak_size_; + time_event->alloc_size_ = TotalMemStatistics(); + time_event->used_by_event_size_ = mem_stat_.used_by_event_size_; + time_event->eager_free_size_ = mem_stat_.eager_free_size_; + time_event->owner_ = DynamicMemAllocatorDebugInfo::GetDebugInfo().name_; + time_event->alloc_type_ = static_cast(DynamicMemAllocatorDebugInfo::GetDebugInfo().type_); + return time_event; +} + +MemoryTimeEventPtr AbstractEnhancedDynamicMemPool::GenFreeMemoryTimeEvent(const void *addr) { + auto time_event = std::make_shared(); + time_event->created_at_ = static_cast( + std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()) + .count()); + time_event->addr_ = const_cast(addr); + const size_t time_event_free_size = -1; + time_event->size_ = time_event_free_size; + time_event->used_size_ = mem_stat_.used_size_; + time_event->peak_size_ = mem_stat_.peak_size_; + time_event->alloc_size_ = TotalMemStatistics(); + time_event->used_by_event_size_ = mem_stat_.used_by_event_size_; + time_event->eager_free_size_ = mem_stat_.eager_free_size_; + return time_event; +} +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.h b/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..6833d48a8a12ed23a04fdd89b8bad83dcf338ae4 --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/memory/abstract_dynamic_mem_pool.h @@ -0,0 +1,497 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_ABSTRACT_DYNAMIC_MEM_POOL_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_ABSTRACT_DYNAMIC_MEM_POOL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hardware/hardware_abstract/memory/dynamic_mem_pool.h" +#include "hardware/hardware_abstract/visible.h" +#include "hardware/hardware_abstract/stream_util.h" + +namespace mindspore { +namespace device { +constexpr size_t kDecimalPrecision = 3; +// largest allocation size for small pool is 1 MB +constexpr size_t kSmallSize = 1048576; + +struct HARDWARE_EXPORT MemBlock; + +using MemBufStatus = DynamicMemBufStatus; +struct HARDWARE_EXPORT MemBuf : EventBase { + explicit MemBuf(size_t size, void *addr, uint32_t stream_id, MemBlock *mem_block, MemBufStatus status); + + MemBuf() = delete; + MemBuf(const MemBuf &) = delete; + MemBuf &operator=(const MemBuf &) = delete; + + ~MemBuf(); + + inline void Link(MemBuf *prev, MemBuf *next) { + if (prev != nullptr) { + prev->next_ = this; + this->prev_ = prev; + } + if (next != nullptr) { + next->prev_ = this; + this->next_ = next; + } + } + + inline void Unlink() { + if (prev_ != nullptr) { + prev_->next_ = next_; + } + if (next_ != nullptr) { + next_->prev_ = prev_; + } + prev_ = nullptr; + next_ = nullptr; + } + + inline void SetDebugInfo() { + owner_name_ = DynamicMemAllocatorDebugInfo::GetDebugInfo().name_; + alloc_type_ = DynamicMemAllocatorDebugInfo::GetDebugInfo().type_; + } + + std::string ToJson() { + JsonBuilder builder; + builder.Append("addr_", addr_); + builder.Append("size_", size_); + builder.Append("stream_id_", stream_id_); + builder.Append("status_", DynamicMemBufStatusToString(status_)); + builder.Append("owner_name_", owner_name_); + return builder.ToString(); + } + + MemBuf *prev_; + MemBuf *next_; + + size_t size_; + void *addr_; + uint32_t stream_id_; + MemBlock *mem_block_; + MemBufStatus status_; + memory::mem_pool::MemType alloc_type_{memory::mem_pool::MemType::kOther}; + std::string owner_name_; +}; + +struct MemBufComparator { + bool operator()(MemBuf *const &left, MemBuf *const &right) const { + return (left->size_ != right->size_) ? 
left->size_ < right->size_ : left->addr_ < right->addr_; + } +}; + +struct HARDWARE_EXPORT MemBlock { + explicit MemBlock(size_t size, void *addr, uint32_t stream_id) : size_(size), addr_(addr), stream_id_(stream_id) { + min_addr_ = nullptr; + max_addr_ = nullptr; + } + + MemBlock() = delete; + MemBlock(const MemBlock &) = delete; + MemBlock &operator=(const MemBlock &) = delete; + + ~MemBlock() = default; + + inline void UpdateBorderAddr(MemBuf *mem_buf) { + if (min_addr_ == nullptr) { + min_addr_ = mem_buf->addr_; + } else { + min_addr_ = std::min(min_addr_, mem_buf->addr_); + } + void *right_addr = static_cast(mem_buf->addr_) + mem_buf->size_; + max_addr_ = std::max(max_addr_, right_addr); + } + + inline size_t ActualPeakSize() { + if (min_addr_ == nullptr || max_addr_ == nullptr) { + return 0; + } + return static_cast(max_addr_) - static_cast(min_addr_); + } + + std::string ToJson() { + JsonBuilder builder; + builder.Append("addr_", addr_); + builder.Append("size_", size_); + builder.Append("stream_id_", stream_id_); + builder.Append("min_addr_", min_addr_); + builder.Append("max_addr_", max_addr_); + return builder.ToString(); + } + + size_t size_; + void *addr_; + uint32_t stream_id_; + + void *min_addr_; + void *max_addr_; +}; + +struct HARDWARE_EXPORT MemStat { + MemStat() { Reset(); } + + MemStat(const MemStat &) = delete; + MemStat &operator=(const MemStat &) = delete; + + void Reset() { + used_size_ = 0; + peak_size_ = 0; + alloc_size_ = 0; + custom_alloc_size_ = 0; + + used_by_event_size_ = 0; + eager_free_size_ = 0; + + iter_used_peak_size_ = 0; + iter_alloc_peak_size_ = 0; + } + + inline size_t IdleSize() const { return alloc_size_ + custom_alloc_size_ - used_size_; } + + inline void UpdatePeakSize(const bool is_enable_vmm, size_t vmm_used_mem_size) { + peak_size_ = std::max(peak_size_, used_size_); + iter_used_peak_size_ = std::max(iter_used_peak_size_, used_size_); + if (is_enable_vmm) { + iter_alloc_peak_size_ = std::max(iter_alloc_peak_size_, vmm_used_mem_size + custom_alloc_size_); + } else { + iter_alloc_peak_size_ = std::max(iter_alloc_peak_size_, alloc_size_ + custom_alloc_size_); + } + } + + std::string ToJson() const { + JsonBuilder builder; + builder.Append("used_size_", used_size_); + builder.Append("peak_size_", peak_size_); + builder.Append("alloc_size_", alloc_size_); + builder.Append("idle_size_", IdleSize()); + builder.Append("used_by_event_size_", used_by_event_size_); + builder.Append("eager_free_size_", eager_free_size_); + return builder.ToString(); + } + + std::string ToReadableString() const { + JsonBuilder builder; + builder.Append("in used mem", Format(used_size_)); + builder.Append("peak used mem", Format(peak_size_)); + builder.Append("alloc mem", Format(alloc_size_)); + builder.Append("idle mem", Format(IdleSize())); + builder.Append("used by event mem", Format(used_by_event_size_)); + builder.Append("eager free mem", Format(eager_free_size_)); + return builder.ToString(); + } + + std::string Format(size_t size) const { + auto str = std::to_string(size * 1.0 / kMBToByte); + return str.substr(0, str.find(".") + kDecimalPrecision) + "MB"; + } + + size_t used_size_; + size_t peak_size_; + size_t alloc_size_; + size_t custom_alloc_size_; + + size_t used_by_event_size_; + size_t eager_free_size_; + + size_t iter_used_peak_size_; + size_t iter_alloc_peak_size_; +}; + +struct AllocatorInfo { + uint32_t stream_id = 0; + bool from_persistent_mem = false; + bool use_small_pool = false; + + bool operator<(const AllocatorInfo &other) const { + if (stream_id != 
other.stream_id) { + return stream_id < other.stream_id; + } + if (from_persistent_mem != other.from_persistent_mem) { + return other.from_persistent_mem; + } + if (use_small_pool != other.use_small_pool) { + return other.use_small_pool; + } + return false; + } + + std::string ToString() const { + std::ostringstream oss; + oss << "stream id: " << stream_id << ", is persistent: " << from_persistent_mem + << ", use small pool: " << use_small_pool; + return oss.str(); + } +}; + +class AbstractDynamicMemPool; + +class HARDWARE_EXPORT MemBufAllocator { + public: + explicit MemBufAllocator(std::function mem_block_expander, + std::function mem_block_cleaner, + std::function mem_mapper, + std::function mem_eager_freer, bool enable_eager_free, + bool is_persistent, uint32_t stream_id, bool is_small, bool is_customized = false) + : mem_block_expander_(mem_block_expander), + mem_block_cleaner_(mem_block_cleaner), + mem_mapper_(mem_mapper), + mem_eager_freer_(mem_eager_freer), + stream_id_(stream_id), + enable_eager_free_(enable_eager_free), + is_persistent_(is_persistent), + is_small_(is_small), + is_customized_(is_customized) { + search_key_ = new MemBuf(0, nullptr, 0, nullptr, MemBufStatus::kMemBufIdle); + } + + MemBufAllocator() = delete; + MemBufAllocator(const MemBufAllocator &) = delete; + MemBufAllocator &operator=(const MemBufAllocator &) = delete; + + ~MemBufAllocator(); + + void Initialize(size_t size); + void ReleaseDeviceRes(); + + MemBuf *Malloc(size_t size); + MemBuf *SearchAvailableMemBuf(size_t size); + bool Free(MemBuf *mem_buf, MemBufStatus target_status = MemBufStatus::kMemBufIdle); + MemBuf *MallocExpandBlock(size_t size); + const std::pair FreeIdleMemsByEagerFree(); + + size_t ReleaseFreeBlocks(); + + size_t ActualPeakSize() const { + size_t peak_size = 0; + for (auto mem_block : mem_blocks_) { + peak_size += mem_block->ActualPeakSize(); + } + return peak_size; + } + + std::string BriefInfo() const { + std::stringstream ss; + ss << "Mem buf allocator, enable vmm : " << enable_eager_free_ << ", is persistent : " << is_persistent_ + << ", stream id : " << stream_id_ << ", is small : " << is_small_ << ", is customized : " << is_customized_ + << "."; + return ss.str(); + } + + uint32_t stream_id() const { return stream_id_; } + bool is_persistent() const { return is_persistent_; } + bool is_small() const { return is_small_; } +#ifndef ENABLE_TEST + + protected: +#endif + MemBuf *MapAndSplitMemBuf(MemBuf *candidate, size_t size); + MemBlock *ExpandBlock(size_t size); + + std::function mem_block_expander_; + std::function mem_block_cleaner_; + std::function mem_mapper_; + std::function mem_eager_freer_; + + std::list mem_blocks_; + using MemAllocator = memory::mem_pool::PooledAllocator; + std::set free_mem_bufs_; + std::set eager_free_mem_bufs_; + + private: + MemBuf *search_key_; + + uint32_t stream_id_; + bool enable_eager_free_; + bool is_persistent_; + bool is_small_; + bool is_customized_; + + friend AbstractDynamicMemPool; +}; +using MemBufAllocatorPtr = std::shared_ptr; + +using Lock = memory::mem_pool::Lock; +using LockGuard = memory::mem_pool::LockGuard; +class HARDWARE_EXPORT AbstractDynamicMemPool : virtual public DynamicMemPool { + public: + AbstractDynamicMemPool(); + ~AbstractDynamicMemPool() override = default; + + void Initialize(size_t init_size, size_t increase_size, size_t max_size) override; + + void ReleaseDeviceRes() override; + + // The main program entry of memory alloc. 
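+  // A minimal usage sketch, assuming a concrete device pool subclass (hypothetical
+  // name DevicePool) that implements the device-side virtuals such as
+  // AllocDeviceMem / FreeDeviceMem:
+  //
+  //   DevicePool pool;
+  //   pool.Initialize(1024 << 20, 1024 << 20, 0);
+  //   void *dev = pool.AllocTensorMem(4096, /* from_persistent_mem */ false,
+  //                                   /* need_recycle */ false, kDefaultStreamIndex);
+  //   if (dev != nullptr) {
+  //     pool.FreeTensorMem(dev);
+  //   }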
+ DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false, + uint32_t stream_id = kDefaultStreamIndex) override; + + // Alloc mem buf from mem pool, return mem buf and its allocator + std::pair AllocMemBuf(size_t align_size, bool from_persistent_mem = false, + uint32_t stream_id = kDefaultStreamIndex); + + // The main program entry of continuous memory alloc. + std::vector AllocContinuousTensorMem(const std::vector &size_list, + uint32_t stream_id = kDefaultStreamIndex) override; + // The main program entry of memory free. + void FreeTensorMem(const DeviceMemPtr &device_addr) override; + bool DoFreeTensorMem(const DeviceMemPtr &device_addr) override; + // The main program entry of part memory free and part memory keep. + void FreePartTensorMems(const std::vector &free_addrs, const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) override; + virtual std::vector DoFreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes); + + // Element in vector : memory_stream_id, address + bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + const std::vector> &memory_stream_addresses, + const DeviceEventPtr &event) override; + bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) override; + bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) override; + bool SyncAllEvents() override; + bool DoSyncAllEvents(); + + size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false) override; + void SetMemAllocUintSize(size_t common_size, size_t persist_size = kDynamicMemAllocUnitSize) override { + common_unit_size_ = common_size; + persist_unit_size_ = persist_size; + } + size_t MemAllocUnitSize(bool from_persistent_mem = false) const override { + return from_persistent_mem ? persist_unit_size_ : common_unit_size_; + } + + void DefragMemory() override; + + std::string DynamicMemPoolStateInfo() const; + + // The statistics information. + size_t TotalMemStatistics() const override; + size_t TotalUsedMemStatistics() const override; + size_t TotalUsedByEventMemStatistics() const override; + size_t TotalIdleMemStatistics() const override; + size_t TotalEagerFreeMemStatistics() const override; + size_t UsedMemPeakStatistics() const override; + size_t MaxMemAllocatedStatistics() const override; + size_t MaxMemReservedStatistics() const override; + size_t ActualPeakStatistics() const override; + std::unordered_map BlockCountsStatistics() const override; + std::unordered_map BlockUnitSizeStatistics() const override; + std::unordered_map> CommonMemBlocksInfoStatistics() + const override; + std::unordered_map> PersistentMemBlocksInfoStatistics() + const override; + void ResetMaxMemReserved() override; + void ResetMaxMemAllocated() override; + + const bool IsEnableVmm() const override { return enable_vmm_; } + + void SetEnableVmm(bool enable_vmm) override { enable_vmm_ = enable_vmm; } + + // Get method for proxy. 
+ std::unordered_map> &addr_mem_buf_allocators() { + return addr_mem_buf_allocators_; + } + + std::unordered_map, std::set, pair_hash> &stream_pair_mem_bufs() { + return stream_pair_mem_bufs_; + } + + const std::pair FreeIdleMemsByEagerFree() override; + + size_t ReleaseFreeBlocks() override; + size_t ReleaseCustomFreeBlocks(); + + MemStat &mem_stat() { return mem_stat_; } + + Lock &lock() { return lock_; } + + protected: + void WaitPipelineHelper(); + + MemBufAllocatorPtr GenerateAllocator(const AllocatorInfo &allocator_key); + MemBufAllocator *GetMemBufAllocator(size_t size, bool from_persistent_mem, uint32_t stream_id); +#ifndef ENABLE_TEST + + protected: +#else + + public: +#endif + std::map stream_id_allocators_; + std::unordered_map> addr_mem_buf_allocators_; + std::unordered_map, std::set, pair_hash> stream_pair_mem_bufs_; + std::map customized_allocators_; + MemStat mem_stat_; + + bool enable_vmm_{false}; + bool enable_custom_allocator_{false}; + std::function custom_alloc_fn_; + std::function custom_free_fn_; + size_t common_unit_size_{kDynamicMemAllocUnitSize}; + size_t persist_unit_size_{kDynamicMemAllocUnitSize}; + + size_t eager_free_count_{0}; + size_t last_eager_free_count_{0}; + Lock lock_; + + // init_size_ is for persistent and common. + size_t init_size_{kDynamicMemAllocUnitSize}; + size_t increase_size_{kDynamicMemAllocUnitSize}; + // Not enable currently. + size_t max_size_{0}; + + bool enable_dump_memory_{false}; +}; + +class HARDWARE_EXPORT AbstractEnhancedDynamicMemPool : public AbstractDynamicMemPool { + public: + AbstractEnhancedDynamicMemPool(); + AbstractEnhancedDynamicMemPool(const AbstractEnhancedDynamicMemPool &) = delete; + AbstractEnhancedDynamicMemPool &operator=(const AbstractEnhancedDynamicMemPool &) = delete; + ~AbstractEnhancedDynamicMemPool() override = default; + + // Report memory pool stat info for enhanced processing. + virtual void ReportMemoryPoolInfo(); + // Report memory pool stat info for mstx + virtual void ReportMemoryPoolMallocInfoToMstx(void *ptr, size_t size); + virtual void ReportMemoryPoolFreeInfoToMstx(void *ptr); + bool IsEnableTimeEvent() override { return enable_time_event_; } + + void SetEnableTimeEvent(bool enable_time_event) override { enable_time_event_ = enable_time_event; } + + virtual MemoryTimeEventPtr GenAllocateMemoryTimeEvent(const void *addr, size_t size, uint32_t stream_id, + bool from_persistent, bool is_persistent); + + virtual MemoryTimeEventPtr GenFreeMemoryTimeEvent(const void *addr); + + private: + std::atomic enable_time_event_{false}; +}; +} // namespace device +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_ABSTRACT_DYNAMIC_MEM_POOL_H_ diff --git a/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.cc b/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..b817c5db3d0e4e161c0f3b6d2f0a390c37ef38aa --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.cc @@ -0,0 +1,122 @@ +/** + * Copyright 2024 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/hardware_abstract/memory/dynamic_mem_pool.h"
+
+#include <map>
+#include <string>
+#include "common/logger.h"
+
+namespace mindspore {
+namespace device {
+static thread_local AllocatorDebugInfo debug_info_;
+
+AllocatorDebugInfo &DynamicMemAllocatorDebugInfo::GetDebugInfo() noexcept { return debug_info_; }
+
+// Set the debug info when memory alloc.
+void DynamicMemAllocatorDebugInfo::SetDebugInfo(const std::string &name, memory::mem_pool::MemType type,
+                                                int input_index, int output_index, uint8_t run_mode) {
+  debug_info_.name_ = name;
+  debug_info_.type_ = type;
+  debug_info_.input_index_ = input_index;
+  debug_info_.output_index_ = output_index;
+  debug_info_.run_mode_ = run_mode;
+}
+
+static const std::map<DynamicMemBufStatus, std::string> kBufStatusString = {
+  {DynamicMemBufStatus::kMemBufIdle, "idle"},
+  {DynamicMemBufStatus::kMemBufUsed, "used"},
+  {DynamicMemBufStatus::kMemBufEagerFree, "eager_free"},
+  {DynamicMemBufStatus::kMemBufUsedByEvent, "used_by_event"}};
+
+const std::string &DynamicMemBufStatusToString(DynamicMemBufStatus status) { return kBufStatusString.at(status); }
+
+bool EventBase::RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, const DeviceEventPtr &event) {
+  if (event == nullptr) {
+    LOG_ERROR << "Event is null.";
+  }
+  if (events_ == nullptr) {
+    events_ = std::make_shared<std::unordered_map<uint32_t, std::shared_ptr<std::list<TaskIdOnStreamEvent>>>>();
+  }
+  std::shared_ptr<std::list<TaskIdOnStreamEvent>> event_list = nullptr;
+  auto iter = events_->find(user_stream_id);
+  if (iter == events_->end()) {
+    event_list = std::make_shared<std::list<TaskIdOnStreamEvent>>();
+    (void)events_->emplace(user_stream_id, event_list);
+  } else {
+    event_list = iter->second;
+    if (event_list == nullptr) {
+      LOG_ERROR << "Event list is null.";
+    }
+  }
+  (void)event_list->emplace_back(task_id_on_stream, event);
+  return true;
+}
+
+bool EventBase::WaitEvent(uint32_t task_id_on_stream, uint32_t user_stream_id) {
+  if (events_ == nullptr) {
+    return false;
+  }
+  auto iter = events_->find(user_stream_id);
+  if (iter == events_->end()) {
+    return false;
+  }
+  auto &event_list = iter->second;
+  if (event_list == nullptr) {
+    LOG_ERROR << "Event list is null.";
+  }
+  // Pop all elements in the list whose task id is not bigger than task_id_on_stream.
+  while (!event_list->empty() && event_list->front().first <= task_id_on_stream) {
+    event_list->pop_front();
+  }
+  // Remove the list if it is empty.
+  if (event_list->empty()) {
+    events_->erase(iter);
+  }
+  return true;
+}
+
+bool EventBase::IsEventNotUsed() { return events_ == nullptr ? true : events_->empty(); }
+
+bool EventBase::SyncAllEvents() {
+  if (IsEventNotUsed()) {
+    return false;
+  }
+
+  for (auto iter = events_->begin(); iter != events_->end();) {
+    auto &event_list = iter->second;
+    if (event_list == nullptr) {
+      LOG_ERROR << "Event list is null.";
+    }
+    for (auto list_iter = event_list->begin(); list_iter != event_list->end();) {
+      auto &event = list_iter->second;
+      // Sync the event if it has not arrived yet.
+      if (!event->QueryEvent()) {
+        event->SyncEvent();
+      }
+      list_iter = event_list->erase(list_iter);
+    }
+    if (event_list->empty()) {
+      // The list is empty, erase it from the map.
+      iter = events_->erase(iter);
+    } else {
+      LOG_ERROR << "Event list is not empty.";
+      // Advance anyway to avoid looping on the same element forever.
+      ++iter;
+    }
+  }
+  return events_->empty();
+}
+}  // namespace device
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.h b/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbabe2a9ba21042c777caaa6d70b026ea54d5991
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/memory/dynamic_mem_pool.h
@@ -0,0 +1,462 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_DYNAMIC_MEM_POOL_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_DYNAMIC_MEM_POOL_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "hardware/hardware_abstract/visible.h"
+#include "hardware/hardware_abstract/memory/mem_pool_util.h"
+#include "hardware/hardware_abstract/stream_util.h"
+#include "hardware/hardware_abstract/device_event.h"
+#include "common/logger.h"
+
+namespace mindspore {
+namespace device {
+constexpr int kShiftOffset = 2;
+// Alloc memory aligned according to 512 bytes.
+constexpr size_t kDynamicMemAlignSize = 512;
+// The minimum unit size (1G) of memory block used for dynamic extend.
+constexpr size_t kDynamicMemAllocUnitSize = 1024 << 20;
+
+const char kPersistentParamMem[] = "Persistent mem";
+const char kCommonMem[] = "Common mem";
+constexpr size_t kMBToByte = 1024 << 10;
+constexpr size_t kGBToByte = 1024 << 20;
+// The smallest memory request size; if a request is smaller than this size, the device memory request may fail.
+// Set by experience to 10M.
+const size_t kMinimumAllocMem = 10 << 20;
+
+const char kBlockMemorySize[] = "block_memory_size";
+const char kBlockStreamId[] = "block_stream_id";
+const char kCommonMemPoolType[] = "common_mem_pool";
+const char kPersistentMemPoolType[] = "persistent_mem_pool";
+using MallocFuncType = void *(size_t, int, void *);
+using FreeFuncType = void(void *, size_t, int, void *);
+
+// The status of memory buf.
+enum class HARDWARE_EXPORT DynamicMemBufStatus : int { kMemBufIdle, kMemBufUsed, kMemBufEagerFree, kMemBufUsedByEvent };
+HARDWARE_EXPORT const std::string &DynamicMemBufStatusToString(DynamicMemBufStatus status);
+
+using DeviceMemPtr = void(*);
+struct DeviceAddrCmp {
+  bool operator()(const DeviceMemPtr &addr1, const DeviceMemPtr &addr2) const { return addr1 < addr2; }
+};
+
+// The AllocatorDebugInfo wrapper, which is thread local, for the dynamic memory pool.
+class HARDWARE_EXPORT DynamicMemAllocatorDebugInfo;
+// Memory buf is the smallest operation object of dynamic memory pool.
+struct HARDWARE_EXPORT DynamicMemBuf;
+using DynamicMemBufPtr = std::shared_ptr<DynamicMemBuf>;
+// Multimap key is the tensor size, for finding the idle memory buf by tensor size.
+using SizeMapMemBuf = std::multimap; +// Map key is the device address, for finding the used memory buf in memory block by device address. +using DeviceAddrMapMemBuf = std::map; +// Memory block is composed of memory buf. +class HARDWARE_EXPORT DynamicMemBlock; +using DynamicMemBlockPtr = std::shared_ptr; + +struct HARDWARE_EXPORT MemStatusManager; +using MemStatusManagerPtr = std::shared_ptr; + +// Help class for unordered_map, pair has no hash method, need override it. +struct pair_hash { + template + std::size_t operator()(const std::pair ¶m) const { + size_t hash = std::hash{}(param.first); + hash <<= (sizeof(size_t) << kShiftOffset); + hash ^= std::hash{}(param.second); + return std::hash{}(hash); + } +}; + +struct HARDWARE_EXPORT MemBuf; + +// Interface of dynamic memory pool. +class HARDWARE_EXPORT DynamicMemPool { + public: + virtual ~DynamicMemPool() = default; + + // Initialize memory pool with init size, increase size and max size. + virtual void Initialize(size_t init_size, size_t increase_size, size_t max_size) {} + + // Release the real device memory. + virtual void ReleaseDeviceRes() { LOG_ERROR << "Not implemented"; } + + // The main program entry of memory alloc. + virtual DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false, + uint32_t stream_id = kDefaultStreamIndex) { + LOG_ERROR << "Not implemented"; + return nullptr; + } + + // The main program entry of continuous memory alloc. + virtual std::vector AllocContinuousTensorMem(const std::vector &size_list, + uint32_t stream_id = kDefaultStreamIndex) { + LOG_ERROR << "Not implemented"; + return {}; + } + // The main program entry of memory free. + virtual void FreeTensorMem(const DeviceMemPtr &device_addr) { LOG_ERROR << "Not implemented"; } + + virtual bool DoFreeTensorMem(const DeviceMemPtr &device_addr) { return false; } + + // The main program entry of part memorys free and part memorys keep. + virtual void FreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) { + LOG_ERROR << "Not implemented"; + } + + // Help method for dynamic memory proxy. + virtual std::vector DoFreePartTensorMems(const std::vector &free_addrs, + const std::vector &keep_addrs, + const std::vector &keep_addr_sizes) { + return {}; + } + + virtual size_t EmptyCache() { return -1L; } + + virtual size_t ReleaseFreeBlocks() { return -1L; } + + // Element in vector : memory_stream_id, address + virtual bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + const std::vector> &memory_stream_addresses, + const DeviceEventPtr &event) { + return false; + } + + virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) { + return false; + } + + virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) { return false; } + + virtual bool SyncAllEvents() { return false; } + + // The real size by memory alloc aligned. + virtual size_t AlignMemorySize(size_t size) const { + if (size == 0) { + return kDynamicMemAlignSize; + } + return ((size + kDynamicMemAlignSize - 1) / kDynamicMemAlignSize) * kDynamicMemAlignSize; + } + + // Calculate memory block required alloc size when adding the memory block. 
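+  // Rounding sketch (values chosen for illustration): with the 512-byte alignment above,
+  //   AlignMemorySize(1) == 512, AlignMemorySize(512) == 512, AlignMemorySize(513) == 1024;
+  // and block requests are rounded up to a multiple of the unit size, e.g. with a 1 GB
+  // unit a 1.5 GB request becomes a 2 GB block in AbstractDynamicMemPool::CalMemBlockAllocSize
+  // (the base virtual below simply returns the unit size).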
+
+// Interface of the dynamic memory pool.
+class HARDWARE_EXPORT DynamicMemPool {
+ public:
+  virtual ~DynamicMemPool() = default;
+
+  // Initialize the memory pool with init size, increase size and max size.
+  virtual void Initialize(size_t init_size, size_t increase_size, size_t max_size) {}
+
+  // Release the real device memory.
+  virtual void ReleaseDeviceRes() { LOG_ERROR << "Not implemented"; }
+
+  // The main entry of memory alloc.
+  virtual DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false,
+                                      uint32_t stream_id = kDefaultStreamIndex) {
+    LOG_ERROR << "Not implemented";
+    return nullptr;
+  }
+
+  // The main entry of continuous memory alloc.
+  virtual std::vector<DeviceMemPtr> AllocContinuousTensorMem(const std::vector<size_t> &size_list,
+                                                             uint32_t stream_id = kDefaultStreamIndex) {
+    LOG_ERROR << "Not implemented";
+    return {};
+  }
+
+  // The main entry of memory free.
+  virtual void FreeTensorMem(const DeviceMemPtr &device_addr) { LOG_ERROR << "Not implemented"; }
+
+  virtual bool DoFreeTensorMem(const DeviceMemPtr &device_addr) { return false; }
+
+  // The main entry for freeing some memories while keeping others.
+  virtual void FreePartTensorMems(const std::vector<DeviceMemPtr> &free_addrs,
+                                  const std::vector<DeviceMemPtr> &keep_addrs,
+                                  const std::vector<size_t> &keep_addr_sizes) {
+    LOG_ERROR << "Not implemented";
+  }
+
+  // Helper method for the dynamic memory proxy.
+  virtual std::vector<DeviceMemPtr> DoFreePartTensorMems(const std::vector<DeviceMemPtr> &free_addrs,
+                                                         const std::vector<DeviceMemPtr> &keep_addrs,
+                                                         const std::vector<size_t> &keep_addr_sizes) {
+    return {};
+  }
+
+  virtual size_t EmptyCache() { return -1L; }
+
+  virtual size_t ReleaseFreeBlocks() { return -1L; }
+
+  // Element in vector : memory_stream_id, address.
+  virtual bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                           const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
+                           const DeviceEventPtr &event) {
+    return false;
+  }
+
+  virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) {
+    return false;
+  }
+
+  virtual bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) { return false; }
+
+  virtual bool SyncAllEvents() { return false; }
+
+  // The real size of an allocation after alignment.
+  virtual size_t AlignMemorySize(size_t size) const {
+    if (size == 0) {
+      return kDynamicMemAlignSize;
+    }
+    return ((size + kDynamicMemAlignSize - 1) / kDynamicMemAlignSize) * kDynamicMemAlignSize;
+  }
+
+  // Calculate the alloc size required when adding a memory block.
+  virtual size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false) {
+    return kDynamicMemAllocUnitSize;
+  }
+
+  // Set the mem pool block size.
+  virtual void SetMemPoolBlockSize(size_t available_device_mem_size) {}
+
+  // Get the minimum memory unit size used for dynamic extension.
+  virtual size_t MemAllocUnitSize(bool from_persistent_mem) const { return kDynamicMemAllocUnitSize; }
+
+  virtual void SetMemAllocUintSize(size_t common_size, size_t persist_size = kDynamicMemAllocUnitSize) {}
+
+  virtual void *GetMinUsingMemoryAddr() const { return nullptr; }
+
+  // The interfaces below operate on real device memory; they need to be overridden per device type.
+  virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual bool FreeDeviceMem(const DeviceMemPtr &addr) {
+    LOG_ERROR << "Not implemented";
+    return false;
+  }
+
+  virtual size_t free_mem_size() { return 0; }
+
+  virtual uint64_t total_mem_size() const { return 0; }
+
+  virtual size_t GetMaxUsedMemSize() const { return 0; }
+
+  virtual size_t GetVmmUsedMemSize() const { return 0; }
+
+  // The related interface of device memory eager free.
+  virtual void DefragMemory() {}
+
+  // Display brief state information of memory blocks and memory bufs.
+  virtual void DumpDynamicMemPoolStateInfo() {}
+
+  // Display detailed debug information of memory blocks and memory bufs.
+  virtual void DumpDynamicMemPoolDebugInfo() {}
+
+  // The statistics information.
+  virtual size_t TotalMemStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t TotalUsedMemStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t TotalUsedByEventMemStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t TotalIdleMemStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t TotalEagerFreeMemStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t UsedMemPeakStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t MaxMemAllocatedStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t MaxMemReservedStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual size_t ActualPeakStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return 0;
+  }
+
+  virtual std::unordered_map<std::string, std::size_t> BlockCountsStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return {};
+  }
+
+  virtual std::unordered_map<std::string, std::size_t> BlockUnitSizeStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return {};
+  }
+
+  virtual std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+  CommonMemBlocksInfoStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return {};
+  }
+
+  virtual std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+  PersistentMemBlocksInfoStatistics() const {
+    LOG_ERROR << "Not implemented";
+    return {};
+  }
+
+  virtual void ResetMaxMemReserved() { LOG_ERROR << "Not implemented"; }
+
+  virtual void ResetMaxMemAllocated() { LOG_ERROR << "Not implemented"; }
+
+  virtual std::string GetMemoryPoolType() const { return "Other"; }
+
+  virtual const bool IsEnableEagerFree() const { return false; }
+
+  virtual const bool IsEnableVmm() const { return false; }
+
+  virtual void SetEnableVmm(bool enable_vmm) {}
+
+  virtual const bool SyncAllStreams() { return false; }
+
+  virtual size_t AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) { return 0; }
+
+  virtual size_t FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) { return 0; }
+
+  virtual size_t MmapDeviceMem(size_t size, DeviceMemPtr addr) { return 0; }
+
+  virtual const std::pair<size_t, size_t> FreeIdleMemsByEagerFree() { return {0, 0}; }
+
+  virtual bool IsEnableTimeEvent() { return false; }
+
+  virtual void SetEnableTimeEvent(bool enable_time_event) {}
+
+  virtual void EnablePluggableAllocator(std::function<MallocFuncType> alloc_fn, std::function<FreeFuncType> free_fn) {}
+
+  virtual void DisablePluggableAllocator() {}
+
+  // Use set methods to avoid a performance decrease.
+  void SetMemoryProfilerCallback(const std::function<void()> &memory_profiler_callback) {
+    memory_profiler_callback_ = memory_profiler_callback;
+  }
+
+  void SetMemoryMstxCallback(const std::function<void(void *, size_t)> memory_malloc_mstx_callback,
+                             const std::function<void(void *)> memory_free_mstx_callback) {
+    memory_malloc_mstx_callback_ = memory_malloc_mstx_callback;
+    memory_free_mstx_callback_ = memory_free_mstx_callback;
+  }
+
+  // Set a rank id getter for the memory pool to generate the dump path.
+  virtual void SetRankIdGetter(const std::function<size_t()> &rank_id_getter) {
+    if (rank_id_getter != nullptr) {
+      rank_id_getter_ = rank_id_getter;
+    }
+  }
+
+  void SetPipelineCallback(const std::function<void()> &pipeline_callback) { pipeline_callback_ = pipeline_callback; }
+
+ protected:
+  std::function<void()> memory_profiler_callback_{nullptr};
+  std::function<size_t()> rank_id_getter_ = []() { return SIZE_MAX; };
+  std::function<void()> pipeline_callback_{nullptr};
+  std::function<void(void *, size_t)> memory_malloc_mstx_callback_{nullptr};
+  std::function<void(void *)> memory_free_mstx_callback_{nullptr};
+};
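+
+// Editorial sketch, not part of the original patch: a minimal DynamicMemPool
+// specialization only needs the real device alloc/free hooks; every other virtual
+// keeps its (logging) default. Host std::malloc/std::free stand in for a device
+// runtime here, assuming <cstdlib>; a real backend would call its driver API instead.
+class HostBackedMemPoolSketch : public DynamicMemPool {
+ public:
+  size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override {
+    *addr = std::malloc(size);
+    return (*addr != nullptr) ? size : 0;  // Return the size actually obtained.
+  }
+
+  bool FreeDeviceMem(const DeviceMemPtr &addr) override {
+    std::free(addr);
+    return true;
+  }
+};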
+
+// Recorded information for debugging the memory allocator.
+struct HARDWARE_EXPORT AllocatorDebugInfo {
+  std::string name_{"Unknown"};
+  memory::mem_pool::MemType type_{memory::mem_pool::MemType::kOther};
+  int input_index_{-1};
+  int output_index_{-1};
+  uint8_t run_mode_{0};
+};
+
+class HARDWARE_EXPORT DynamicMemAllocatorDebugInfo {
+ public:
+  static AllocatorDebugInfo &GetDebugInfo() noexcept;
+
+  // Set the debug info when memory is allocated.
+  static void SetDebugInfo(const std::string &name, memory::mem_pool::MemType type, int input_index = -1,
+                           int output_index = -1, uint8_t run_mode = 0);
+
+ private:
+  DynamicMemAllocatorDebugInfo() = default;
+  virtual ~DynamicMemAllocatorDebugInfo() = default;
+  DynamicMemAllocatorDebugInfo(const DynamicMemAllocatorDebugInfo &) = delete;
+  DynamicMemAllocatorDebugInfo &operator=(const DynamicMemAllocatorDebugInfo &) = delete;
+};
+
+using TaskIdOnStreamEvent = std::pair<int64_t, DeviceEventPtr>;
+struct HARDWARE_EXPORT EventBase {
+  // Record an event on the mem buf.
+  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, const DeviceEventPtr &event);
+
+  // Release events on the mem buf.
+  bool WaitEvent(uint32_t task_id_on_stream, uint32_t user_stream_id);
+
+  // Indicates whether the mem buf is used by an event; returns true when no event is bound to the mem buf.
+  bool IsEventNotUsed();
+
+  // Sync all events that are bound to the mem buf.
+  bool SyncAllEvents();
+
+  // Parameter: user_stream_id, list of <task_id_on_stream, event>.
+  std::shared_ptr<std::list<std::pair<uint32_t, TaskIdOnStreamEvent>>> events_{nullptr};
+};
+
+struct HARDWARE_EXPORT JsonBuilder {
+  JsonBuilder() { buffer_ << "{"; }
+
+  template <typename T>
+  void Append(std::string key, T value) {
+    buffer_ << "\"" << key << "\":" << value << ",";
+  }
+
+  std::string ToString() {
+    // Overwrite the trailing comma with the closing brace.
+    buffer_.seekp(-1, buffer_.cur);
+    buffer_ << "}";
+    return buffer_.str();
+  }
+
+  std::stringstream buffer_;
+};
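+
+// Editorial sketch, not part of the original patch: JsonBuilder appends "key":value
+// pairs and ToString() overwrites the trailing comma with the closing brace. Note that
+// Append does not quote string values. The sketch below yields {"size_":512,"stream_id_":3}.
+inline std::string JsonBuilderUsageSketch() {
+  JsonBuilder builder;
+  builder.Append("size_", 512);
+  builder.Append("stream_id_", 3);
+  return builder.ToString();
+}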
+
+struct HARDWARE_EXPORT MemoryTimeEvent {
+  // Creation time of the address in ns.
+  uint64_t created_at_{0};
+
+  // Device address.
+  void *addr_{nullptr};
+
+  // Size of the memory allocation.
+  size_t size_{0};
+
+  // Used size of the memory pool.
+  size_t used_size_{0};
+
+  // Peak size of the memory pool.
+  size_t peak_size_{0};
+
+  // Allocated size of the memory pool.
+  size_t alloc_size_{0};
+
+  // Memory size that is referred to by events.
+  size_t used_by_event_size_{0};
+
+  // Eager free memory size.
+  size_t eager_free_size_{0};
+
+  // Whether the allocation is from persistent memory.
+  uint8_t from_persistent_{false};
+
+  // Whether the allocated memory is persistent.
+  uint8_t is_persistent_{false};
+
+  // pynative or graph or ge.
+  uint8_t run_mode_{0};
+
+  // Data type of this address.
+  uint8_t alloc_type_{0};
+
+  // Stream id of the address.
+  uint32_t stream_id_{0};
+
+  // Owner of this address.
+  std::string owner_;
+
+  std::string ToJson() {
+    JsonBuilder builder;
+    builder.Append("created_at_", created_at_);
+    builder.Append("addr_", addr_);
+    builder.Append("size_", size_);
+    builder.Append("from_persistent_", from_persistent_);
+    builder.Append("stream_id_", stream_id_);
+    builder.Append("run_mode_", run_mode_);
+    builder.Append("used_size_", used_size_);
+    builder.Append("peak_size_", peak_size_);
+    builder.Append("alloc_size_", alloc_size_);
+    builder.Append("used_by_event_size_", used_by_event_size_);
+    builder.Append("eager_free_size_", eager_free_size_);
+    builder.Append("owner_", owner_);
+    builder.Append("alloc_type_", alloc_type_);
+    return builder.ToString();
+  }
+};
+using MemoryTimeEventPtr = std::shared_ptr<MemoryTimeEvent>;
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_DYNAMIC_MEM_POOL_H_
diff --git a/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.cc b/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23273509b04bdee8cb6a9760ad897aa6e28148fc
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.cc
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hardware/hardware_abstract/memory/mem_pool_util.h"
+#include <map>
+
+namespace mindspore {
+namespace memory {
+namespace mem_pool {
+const std::map<MemType, std::string> kMemTypeStr = {{MemType::kWeight, "Weight"},
+                                                    {MemType::kConstantValue, "ConstantValue"},
+                                                    {MemType::kKernel, "Kernel"},
+                                                    {MemType::kGraphOutput, "GraphOutput"},
+                                                    {MemType::kSomas, "Somas"},
+                                                    {MemType::kSomasOutput, "SomasOutput"},
+                                                    {MemType::kGeConst, "GeConst"},
+                                                    {MemType::kGeFixed, "GeFixed"},
+                                                    {MemType::kBatchMemory, "BatchMemory"},
+                                                    {MemType::kContinuousMemory, "ContinuousMemory"},
+                                                    {MemType::kPyNativeInput, "PyNativeInput"},
+                                                    {MemType::kPyNativeOutput, "PyNativeOutput"},
+                                                    {MemType::kWorkSpace, "WorkSpace"},
+                                                    {MemType::kOther, "Other"}};
+
+std::string MemTypeToStr(MemType mem_type) { return kMemTypeStr.at(mem_type); }
+}  // namespace mem_pool
+}  // namespace memory
+}  // namespace mindspore
diff --git a/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.h b/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0530008be02020e3af4ec65bed63c22bc2f547a6
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/memory/mem_pool_util.h
@@ -0,0 +1,168 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_MEMORY_MEM_POOL_MEM_POOL_UTIL_H_
+#define MINDSPORE_CCSRC_MEMORY_MEM_POOL_MEM_POOL_UTIL_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <new>
+#include <string>
+
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace memory {
+namespace mem_pool {
+enum class MemType : int {
+  kWeight = 0,
+  kConstantValue,
+  kKernel,
+  kGraphOutput,
+  kSomas,
+  kSomasOutput,
+  kGeConst,
+  kGeFixed,
+  kBatchMemory,
+  kContinuousMemory,
+  kPyNativeInput = 10,
+  kPyNativeOutput,
+  kWorkSpace,
+  kOther
+};
+
+class HARDWARE_EXPORT Lock {
+ public:
+  inline void lock() {
+    while (locked.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+  inline void unlock() { locked.clear(std::memory_order_release); }
+
+ protected:
+  std::atomic_flag locked = ATOMIC_FLAG_INIT;
+};
+
+class HARDWARE_EXPORT LockGuard {
+ public:
+  explicit LockGuard(const Lock &lock) : lock_(const_cast<Lock *>(&lock)) { lock_->lock(); }
+  ~LockGuard() { lock_->unlock(); }
+
+ private:
+  Lock *lock_;
+};
+
+HARDWARE_EXPORT std::string MemTypeToStr(MemType mem_type);
+
+constexpr size_t kPoolGrowSize = 1 << 20;
+
+template <typename T>
+class ObjectPool {
+  struct Buf {
+    Buf *next_;
+  };
+
+  class Buffer {
+    static const std::size_t bucket_size = sizeof(T) > sizeof(Buf) ? sizeof(T) : sizeof(Buf);
+    static const std::size_t kDataBucketSize = bucket_size * kPoolGrowSize;
+
+   public:
+    explicit Buffer(Buffer *next) : next_(next) {}
+
+    T *GetBlock(std::size_t index) {
+      if (index >= kPoolGrowSize) {
+        throw std::bad_alloc();
+      }
+      return reinterpret_cast<T *>(&data_[bucket_size * index]);
+    }
+
+    Buffer *const next_;
+
+   private:
+    uint8_t data_[kDataBucketSize];
+  };
+
+  Buf *free_list_ = nullptr;
+  Buffer *buffer_head_ = nullptr;
+  std::size_t buffer_index_ = kPoolGrowSize;
+
+ public:
+  ObjectPool() = default;
+  ObjectPool(ObjectPool &&object_pool) = delete;
+  ObjectPool(const ObjectPool &object_pool) = delete;
+  ObjectPool &operator=(const ObjectPool &object_pool) = delete;
+  ObjectPool &operator=(ObjectPool &&object_pool) = delete;
+
+  ~ObjectPool() {
+    while (buffer_head_ != nullptr) {
+      Buffer *buffer = buffer_head_;
+      buffer_head_ = buffer->next_;
+      delete buffer;
+    }
+  }
+
+  T *Borrow() {
+    if (free_list_ != nullptr) {
+      Buf *buf = free_list_;
+      free_list_ = buf->next_;
+      return reinterpret_cast<T *>(buf);
+    }
+
+    if (buffer_index_ >= kPoolGrowSize) {
+      buffer_head_ = new Buffer(buffer_head_);
+      buffer_index_ = 0;
+    }
+
+    return buffer_head_->GetBlock(buffer_index_++);
+  }
+
+  void Return(T *obj) {
+    Buf *buf = reinterpret_cast<Buf *>(obj);
+    buf->next_ = free_list_;
+    free_list_ = buf;
+  }
+};
+
+// Not supported on older Windows versions.
+template <typename T>
+class PooledAllocator : private ObjectPool<T> {
+ public:
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef T *pointer;
+  typedef const T *const_pointer;
+  typedef T &reference;
+  typedef const T &const_reference;
+  typedef T value_type;
+
+  template <typename U>
+  struct rebind {
+    typedef PooledAllocator<U> other;
+  };
+
+  pointer allocate(size_type n, const void *hint = nullptr) {
+    if (n != 1 || hint) throw std::bad_alloc();
+    return ObjectPool<T>::Borrow();
+  }
+
+  void deallocate(pointer p, size_type n) { ObjectPool<T>::Return(p); }
+
+  void construct(pointer p, const_reference val) { new (p) T(val); }
+
+  void destroy(pointer p) { p->~T(); }
+};
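+
+// Editorial sketch, not part of the original patch: ObjectPool::Borrow() hands out raw,
+// uninitialized storage, so callers construct with placement-new and destroy manually
+// before Return(). NodeSketch is a hypothetical payload type.
+struct NodeSketch {
+  int value;
+};
+
+inline void ObjectPoolUsageSketch() {
+  ObjectPool<NodeSketch> pool;
+  NodeSketch *node = new (pool.Borrow()) NodeSketch{7};  // Construct in pooled storage.
+  node->~NodeSketch();                                   // Destroy before returning storage.
+  pool.Return(node);
+}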
+}  // namespace mem_pool
+}  // namespace memory
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_MEMORY_MEM_POOL_MEM_POOL_UTIL_H_
diff --git a/inferrt/src/hardware/hardware_abstract/memory_manager.cc b/inferrt/src/hardware/hardware_abstract/memory_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0952c8964168ecd9e367e46393ecd6476feab03a
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/memory_manager.cc
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hardware/hardware_abstract/memory_manager.h"
+#include <vector>
+#include "common/common.h"
+
+namespace mindspore {
+namespace device {
+constexpr size_t kAlignBytes = 32;
+
+size_t MemoryManager::GetCommonAlignSize(size_t input_size) {
+  return ((input_size + kMemAlignSize + kAlignBytes - 1) / kMemAlignSize) * kMemAlignSize;
+}
+
+size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) {
+  return ((input_size + kMemAlignSize - 1) / kMemAlignSize) * kMemAlignSize + kTwiceMemAlignSize;
+}
+
+void MemoryManager::FreeMemFromMemPool(void *device_ptr) {
+  if (device_ptr == nullptr) {
+    LOG_ERROR << "FreeMemFromMemPool device_ptr is null.";
+  }
+}
+
+uint8_t *MemoryManager::MallocWorkSpaceMem(size_t size) { return MallocDynamicMem(size, false); }
+
+uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) {
+  LOG_OUT << "Call default dynamic malloc, size: " << size << ", communication_mem: " << communication_mem;
+  return nullptr;
+}
+
+void *MemoryManager::MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool, uint32_t stream_id) {
+  if (size == 0) {
+    LOG_ERROR << "MallocMemFromMemPool size is 0.";
+  }
+  return nullptr;
+}
+
+std::vector<void *> MemoryManager::MallocContinuousMemFromMemPool(const std::vector<size_t> &size_list,
+                                                                  uint32_t stream_id) {
+  if (size_list.empty()) {
+    LOG_ERROR << "MallocContinuousMemFromMemPool size list's size is 0.";
+  }
+  std::vector<void *> device_ptr_list;
+  for (size_t i = 0; i < size_list.size(); ++i) {
+    (void)device_ptr_list.emplace_back(nullptr);
+  }
+  return device_ptr_list;
+}
+}  // namespace device
+}  // namespace mindspore
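+
+// Editorial note, not part of the original patch: worked examples for the two align
+// helpers above, with kMemAlignSize = 512, kAlignBytes = 32 and kTwiceMemAlignSize = 1024:
+//   GetCommonAlignSize(100)        = ((100 + 512 + 31) / 512) * 512   = 512
+//   GetCommonAlignSize(482)        = ((482 + 512 + 31) / 512) * 512   = 1024
+//   GetCommunicationAlignSize(100) = ((100 + 511) / 512) * 512 + 1024 = 1536
+// Common alignment rounds up with kAlignBytes of headroom; communication alignment
+// reserves a full kTwiceMemAlignSize of headroom on top of the rounded size.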
diff --git a/inferrt/src/hardware/hardware_abstract/memory_manager.h b/inferrt/src/hardware/hardware_abstract/memory_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..49011b321b9d82b7289033b158717b77924c8096
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/memory_manager.h
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_MEMORY_MANAGER_H_
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_MEMORY_MANAGER_H_
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "common/logger.h"
+#include "hardware/hardware_abstract/memory/dynamic_mem_pool.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+enum class MemType { kStaticMem, kDynamicMem, kSomasReuseDynamicMem };
+const uint32_t kInvalidGraphId = UINT32_MAX;
+constexpr int kGetAllOuts = -1;
+constexpr uint64_t kMemAlignSize = 512;
+constexpr uint64_t kTwiceMemAlignSize = kMemAlignSize << 1;
+class HARDWARE_EXPORT MemoryManager {
+ public:
+  MemoryManager() = default;
+  virtual ~MemoryManager() = default;
+
+  virtual void Initialize() = 0;
+  virtual void Finalize() = 0;
+  virtual void ResetDynamicMemory() {}
+  virtual void ClearGlobalIdleMem() {}
+
+  uint8_t *MallocWorkSpaceMem(size_t size);
+  virtual void *MallocMemFromMemPool(size_t size, bool from_persistent_mem, bool need_recycle = false,
+                                     uint32_t stream_id = kDefaultStreamIndex);
+  virtual size_t GetMaxUsedMemorySize() const { return 0; }
+  virtual void FreeMemFromMemPool(void *device_ptr);
+  virtual std::vector<void *> MallocContinuousMemFromMemPool(const std::vector<size_t> &size_list,
+                                                             uint32_t stream_id = kDefaultStreamIndex);
+
+  static size_t GetCommonAlignSize(size_t input_size);
+  static size_t GetCommunicationAlignSize(size_t input_size);
+
+  virtual size_t GetAvailableMemSize() {
+    LOG_ERROR << "Return default 0 mem size!";
+    return 0;
+  }
+
+  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                   const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
+                   const DeviceEventPtr &event) {
+    if (GetMemoryPool() == nullptr) {
+      LOG_OUT << "Memory pool is nullptr.";
+      return false;
+    }
+    return GetMemoryPool()->RecordEvent(task_id_on_stream, user_stream_id, memory_stream_addresses, event);
+  }
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) {
+    if (GetMemoryPool() == nullptr) {
+      LOG_OUT << "Memory pool is nullptr.";
+      return false;
+    }
+    return GetMemoryPool()->WaitEvent(task_id_on_stream, user_stream_id, memory_stream_id);
+  }
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id) {
+    if (GetMemoryPool() == nullptr) {
+      LOG_OUT << "Memory pool is nullptr.";
+      return false;
+    }
+    return GetMemoryPool()->WaitEvent(task_id_on_stream, memory_stream_id);
+  }
+  bool SyncAllEvents() {
+    if (GetMemoryPool() == nullptr) {
+      LOG_OUT << "Memory pool is nullptr.";
+      return false;
+    }
+    return GetMemoryPool()->SyncAllEvents();
+  }
+
+  virtual DynamicMemPool *GetMemoryPool() = 0;
+
+  // Relevant functions to manage memory statistics.
+  virtual size_t GetTotalMemStatistics() const { return 0; }
+  virtual size_t GetTotalUsedMemStatistics() const { return 0; }
+  virtual size_t GetTotalIdleMemStatistics() const { return 0; }
+  virtual size_t GetTotalEagerFreeMemStatistics() const { return 0; }
+  virtual size_t GetUsedMemPeakStatistics() const { return 0; }
+  virtual size_t GetReservedMemPeakStatistics() const { return 0; }
+  virtual std::unordered_map<std::string, std::size_t> GetBlockCountsStatistics() const { return {}; }
+  virtual std::unordered_map<std::string, std::size_t> GetBlockUnitSizeStatistics() const { return {}; }
+  virtual std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+  GetCommonMemBlocksInfoStatistics() const {
+    return {};
+  }
+  virtual std::unordered_map<DeviceMemPtr, std::unordered_map<std::string, size_t>>
+  GetPersistentMemBlocksInfoStatistics() const {
+    return {};
+  }
+  virtual void ResetMaxMemoryReserved() {}
+  virtual void ResetMaxMemoryAllocated() {}
+  virtual size_t EmptyCache() { return -1L; }
+
+ protected:
+  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem, uint32_t graph_id) = 0;
+  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem) {
+    return MallocStaticMem(size, communication_mem, kInvalidGraphId);
+  }
+  virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
+
+  // Holds the memory pool for common operations on memory.
+  DynamicMemPool *memory_pool_{nullptr};
+};
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_MEMORY_MANAGER_H_
diff --git a/inferrt/src/hardware/hardware_abstract/multi_stream_controller.cc b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a849a9655430352e4764e671920137ccd57b3664
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.cc
@@ -0,0 +1,339 @@
+/**
+ * Copyright 2024-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hardware/hardware_abstract/multi_stream_controller.h"
+
+#include <algorithm>
+#include <utility>
+#include "common/logger.h"
+
+namespace mindspore {
+namespace device {
+constexpr size_t kDefaultStreamRefreshSize = 2;
+
+namespace {
+template <typename T>
+struct AtomicWrapper {
+  AtomicWrapper() : value_(0L) {}
+  explicit AtomicWrapper(const std::atomic<T> &value) : value_(value.load()) {}
+  AtomicWrapper(const AtomicWrapper &other) : value_(other.value_.load()) {}
+  AtomicWrapper &operator=(const AtomicWrapper &other) {
+    if (this == &other) {
+      return *this;
+    }
+    value_.store(other.value_.load());
+    return *this;
+  }
+
+  std::atomic<T> value_;
+};
+
+class LockGuard {
+ public:
+  explicit LockGuard(SpinLock &lock) : spin_lock_(lock) { spin_lock_.lock(); }
+  ~LockGuard() { spin_lock_.unlock(); }
+
+ private:
+  SpinLock &spin_lock_;
+};
+}  // namespace
+
+class TaskIdOnStreamManager {
+ public:
+  TaskIdOnStreamManager() = default;
+
+  void Resize(uint32_t stream_size) {
+    if (initialized_ && stream_size <= initialize_size_) {
+      LOG_OUT << "Task id on stream manager has already been initialized, current size : " << initialize_size_ << ".";
+      return;
+    }
+    LOG_OUT << "Task id on stream manager initialized : " << initialized_ << ", stream_size : " << stream_size << ".";
+    uint32_t min_stream_size = 2;
+    initialize_size_ = std::max(stream_size, min_stream_size);
+    generator_.resize(initialize_size_);
+    status_.resize(initialize_size_);
+    for (auto &vec : status_) {
+      vec.resize(initialize_size_);
+    }
+    initialized_ = true;
+  }
+
+  inline int64_t Query(uint32_t user_stream_id, uint32_t memory_stream_id) {
+    return status_[user_stream_id][memory_stream_id];
+  }
+
+  inline bool Update(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) {
+    if (status_[user_stream_id][memory_stream_id] >= task_id_on_stream) {
+      return false;
+    }
+    status_[user_stream_id][memory_stream_id] = task_id_on_stream;
+    return true;
+  }
+
+  inline int64_t Launch(uint32_t stream_id) {
+    if (stream_id >= generator_.size()) {
+      LOG_OUT << "Launch stream id : " << stream_id << " exceeds generator_ size : " << generator_.size()
+              << ", resizing.";
+      generator_.resize(stream_id + 1);
+      status_.resize(stream_id + 1);
+      for (auto &vec : status_) {
+        vec.resize(stream_id + 1);
+      }
+    }
+    return ++generator_[stream_id].value_;
+  }
+
+  inline int64_t Get(uint32_t stream_id) { return generator_[stream_id].value_; }
+
+ private:
+  bool initialized_{false};
+  uint32_t initialize_size_{0};
+  std::vector<AtomicWrapper<int64_t>> generator_;
+  std::vector<std::vector<int64_t>> status_;
+};
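+
+// Editorial sketch, not part of the original patch: how Launch/Update/Query are meant
+// to compose. Stream ids are arbitrary example values.
+inline void TaskIdOnStreamSketch(TaskIdOnStreamManager *manager) {
+  manager->Resize(4);
+  int64_t task_id = manager->Launch(2);  // A task is issued on (producer) stream 2.
+  // User stream 0 synchronizes with stream 2 up to that task.
+  manager->Update(task_id, 0, 2);
+  // Query now returns the newest producer task that user stream 0 has seen, i.e. task_id.
+  (void)manager->Query(0, 2);
+}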
+
+// Event pool recycled with ref counting; the pool reuses events when it cannot create more.
+class EventPool {
+ public:
+  explicit EventPool(std::function<DeviceEventPtr(void)> event_creator) : event_creator_(std::move(event_creator)) {}
+  ~EventPool() {
+    LockGuard lock(lock_);
+    expired_ = true;
+    events_.clear();
+    cached_events_.clear();
+  }
+
+  EventPool() = delete;
+  EventPool(const EventPool &) = delete;
+  EventPool &operator=(const EventPool &) = delete;
+
+  // Get an event from the pool; the event is wrapped in a shared_ptr.
+  DeviceEventPtr Get() {
+    LOG_OUT << "Event pool get start.";
+    LockGuard lock(lock_);
+    DeviceEvent *event = nullptr;
+    // Try to create an event first, before the core size is reached.
+    if (size_ < core_size_) {
+      auto created_event = event_creator_();
+      if (created_event != nullptr && created_event->IsReady()) {
+        cached_events_.push_back(created_event);
+        size_++;
+        event = created_event.get();
+      } else {
+        core_size_ = size_;
+      }
+    }
+    // Try to reuse an event.
+    if (event == nullptr) {
+      auto iter = events_.begin();
+      while (iter != events_.end()) {
+        auto event_in_list = *iter;
+        if (event_in_list == nullptr) {
+          LOG_ERROR << "Exception : event in list is nullptr, events_ size : " << events_.size() << ".";
+        }
+        if (event_in_list->QueryEvent()) {
+          event = event_in_list;
+          events_.erase(iter);
+          break;
+        }
+        iter++;
+      }
+    }
+    // Reuse failed, try to create one more event.
+    if (event == nullptr) {
+      auto created_event = event_creator_();
+      if (created_event != nullptr && created_event->IsReady()) {
+        cached_events_.push_back(created_event);
+        event = created_event.get();
+        size_++;
+      } else {
+        LOG_ERROR << "Get event failed.";
+      }
+    }
+    LOG_OUT << "Get event, events_ size : " << events_.size() << ", event : " << event << ".";
+
+    auto event_ptr = std::shared_ptr<DeviceEvent>(event, [&](DeviceEvent *e) {
+      LockGuard lock(lock_);
+      if (!expired_) {
+        LOG_OUT << "Return event : " << e << ".";
+        events_.push_back(e);
+      } else {
+        LOG_OUT << "Return event : " << e << " failed.";
+      }
+    });
+    return event_ptr;
+  }
+
+ private:
+  SpinLock lock_;
+  bool expired_{false};
+  // The pool only creates events until it reaches the core size, i.e. half of the size limit.
+  size_t core_size_{32768};
+  size_t size_{0};
+  std::function<DeviceEventPtr(void)> event_creator_;
+  std::list<DeviceEvent *> events_;
+  // cached_events_ holds shared_ptrs of events, since the device res manager returns smart pointers.
+  std::list<DeviceEventPtr> cached_events_;
+};
+using EventPoolPtr = std::shared_ptr<EventPool>;
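+
+// Editorial sketch, not part of the original patch: events are handed out as
+// shared_ptrs whose deleter returns them to the pool, so dropping the last reference
+// recycles the event instead of destroying it.
+inline void EventPoolUsageSketch(DeviceResManager *res_manager) {
+  EventPool pool([res_manager]() { return res_manager->CreateRuntimeEvent(true, false); });
+  {
+    DeviceEventPtr event = pool.Get();
+    // ... record / wait with the event here ...
+  }  // The shared_ptr deleter pushes the event back into the pool for reuse.
+}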
+ LOG_OUT << "Stream size is 0, will initialize with 2 streams."; + stream_size = kDefaultStreamRefreshSize; + } + task_id_on_stream_manager_->Resize(stream_size); + if (event_pool_ == nullptr) { + event_pool_ = std::make_shared([&]() { + // Event in pool need to do synchronization between streams, need to enable blocking. + return device_res_base_->CreateRuntimeEvent(true, false); + }); + } +} + +bool MultiStreamController::UpdateTaskIdOnStream(int64_t task_id_on_stream, uint32_t user_stream_id, + uint32_t memory_stream_id) { + LockGuard lock(lock_); + return task_id_on_stream_manager_->Update(task_id_on_stream, user_stream_id, memory_stream_id); +} + +int64_t MultiStreamController::QueryTaskIdOnStream(uint32_t user_stream_id, uint32_t memory_stream_id) { + LockGuard lock(lock_); + return task_id_on_stream_manager_->Query(user_stream_id, memory_stream_id); +} + +int64_t MultiStreamController::LaunchTaskIdOnStream(uint32_t stream_id) { + LockGuard lock(lock_); + return task_id_on_stream_manager_->Launch(stream_id); +} + +int64_t MultiStreamController::GetTaskIdOnStream(uint32_t stream_id) { + LockGuard lock(lock_); + return task_id_on_stream_manager_->Get(stream_id); +} + +std::mutex &MultiStreamController::GetStreamMutex(size_t stream_id) { + LockGuard lock(lock_); + return stream_mutexes_[stream_id]; +} + +bool MultiStreamController::RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, + const std::vector> &memory_stream_addresses, + const DeviceEventPtr &input_event) { + LockGuard lock(lock_); + DeviceEventPtr event = nullptr; + if (input_event != nullptr) { + event = input_event; + } else { + event = device_res_base_->CreateRuntimeEvent(false, true); + if (event == nullptr) { + return true; + } + event->RecordEvent(user_stream_id); + } + + return device_res_base_->RecordEvent(task_id_on_stream, user_stream_id, memory_stream_addresses, event); +} + +bool MultiStreamController::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id) { + LockGuard lock(lock_); + // If update task id on stream failed, means task id on stream is elder one, no need to wait event on mem manager. + if (!task_id_on_stream_manager_->Update(task_id_on_stream, user_stream_id, memory_stream_id)) { + LOG_OUT << "Skip Wait Event."; + return false; + } + return device_res_base_->WaitEvent(task_id_on_stream, user_stream_id, memory_stream_id); +} + +bool MultiStreamController::WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id) { + LockGuard lock(lock_); + return device_res_base_->WaitEvent(task_id_on_stream, user_stream_id); +} + +bool MultiStreamController::DispatchRecordWaitEvent(uint32_t user_stream_id, uint32_t memory_stream_id) { + LockGuard lock(lock_); + if (event_pool_ == nullptr) { + LOG_OUT << "Event pool is not initialized."; + event_pool_ = std::make_shared([&]() { + // Event in pool need to do synchronization between streams, need to enable blocking. + return device_res_base_->CreateRuntimeEvent(true, false); + }); + } + auto event = event_pool_->Get(); + // Note : record event on memory stream id and wait event on user stream id to make sure memory is safe. 
+ event->RecordEvent(memory_stream_id); + event->WaitEvent(user_stream_id); + return true; +} + +bool MultiStreamController::SyncStream(size_t stream_id) { + LockGuard lock(lock_); + bool ret = device_res_base_->SyncStream(stream_id); + auto task_id_on_stream = task_id_on_stream_manager_->Get(stream_id); + device_res_base_->WaitEvent(task_id_on_stream, stream_id); + return ret; +} + +bool MultiStreamController::SyncAllStreams() { + LockGuard lock(lock_); + bool ret = device_res_base_->SyncAllStreams(); + device_res_base_->SyncAllEvents(); + return ret; +} + +bool MultiStreamController::SyncNotDefaultStreams() { + LockGuard lock(lock_); + bool ret = device_res_base_->SyncNotDefaultStreams(); + const auto &stream_ids = device_res_base_->GetStreamIds(); + for (auto stream_id : stream_ids) { + auto task_id_on_stream = task_id_on_stream_manager_->Get(stream_id); + device_res_base_->WaitEvent(task_id_on_stream, stream_id); + } + return ret; +} + +bool MultiStreamController::WaitMultiStream(size_t wait_stream_id) { + LockGuard lock(lock_); + LOG_OUT << "Wait multi stream on wait stream id : " << wait_stream_id << "."; + const auto &stream_ids = device_res_base_->GetStreamIds(); + if (event_pool_ == nullptr) { + LOG_OUT << "Event pool is not initialized."; + event_pool_ = std::make_shared([&]() { + // Event in pool need to do synchronization between streams, need to enable blocking. + return device_res_base_->CreateRuntimeEvent(true, false); + }); + } + device_res_base_->BindDeviceToCurrentThread(true); + auto event = event_pool_->Get(); + for (auto stream_id : stream_ids) { + if (stream_id != wait_stream_id) { + event->RecordEvent(stream_id); + event->WaitEvent(wait_stream_id); + } + } + return true; +} +} // namespace device +} // namespace mindspore diff --git a/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h new file mode 100644 index 0000000000000000000000000000000000000000..443020e28edb362dd2e271e742116cea989668f9 --- /dev/null +++ b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h @@ -0,0 +1,100 @@ +/** + * Copyright 2024-2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
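+
+// Editorial sketch, not part of the original patch: the record-then-wait pattern that
+// DispatchRecordWaitEvent implements, spelled out. Recording on the memory (producer)
+// stream and waiting on the user (consumer) stream orders the consumer after all work
+// the producer had queued when the event was recorded, which is what keeps the memory safe.
+inline void RecordWaitUsageSketch(MultiStreamController *controller) {
+  constexpr uint32_t kUserStreamId = 0;    // Consumer stream.
+  constexpr uint32_t kMemoryStreamId = 1;  // Producer stream that owns the memory.
+  (void)controller->DispatchRecordWaitEvent(kUserStreamId, kMemoryStreamId);
+}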
diff --git a/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h
new file mode 100644
index 0000000000000000000000000000000000000000..443020e28edb362dd2e271e742116cea989668f9
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/multi_stream_controller.h
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2024-2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_MULTI_STREAM_CONTROLLER_HEADER_H
+#define MINDSPORE_CCSRC_RUNTIME_DEVICE_MULTI_STREAM_CONTROLLER_HEADER_H
+
+#include <atomic>
+#include <functional>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "hardware/hardware_abstract/device_event.h"
+#include "hardware/hardware_abstract/device_context.h"
+#include "hardware/hardware_abstract/visible.h"
+
+namespace mindspore {
+namespace device {
+class SpinLock {
+ public:
+  void lock() {
+    while (locked.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+
+  void unlock() { locked.clear(std::memory_order_release); }
+
+ private:
+  std::atomic_flag locked = ATOMIC_FLAG_INIT;
+};
+
+class TaskIdOnStreamManager;
+using TaskIdOnStreamManagerPtr = std::shared_ptr<TaskIdOnStreamManager>;
+
+class EventPool;
+using EventPoolPtr = std::shared_ptr<EventPool>;
+
+class HARDWARE_EXPORT MultiStreamController {
+ public:
+  explicit MultiStreamController(DeviceResManager *device_res_base);
+
+  MultiStreamController(const MultiStreamController &) = delete;
+  MultiStreamController &operator=(const MultiStreamController &) = delete;
+  MultiStreamController(MultiStreamController &&) = delete;
+
+  ~MultiStreamController() = default;
+
+  void Refresh();
+
+  bool UpdateTaskIdOnStream(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id);
+
+  int64_t QueryTaskIdOnStream(uint32_t user_stream_id, uint32_t memory_stream_id);
+
+  int64_t LaunchTaskIdOnStream(uint32_t stream_id);
+  int64_t GetTaskIdOnStream(uint32_t stream_id);
+
+  std::mutex &GetStreamMutex(size_t stream_id);
+
+  // memory_stream_addresses pair : memory_stream_id, address.
+  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
+                   const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
+                   const DeviceEventPtr &input_event = nullptr);
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id);
+  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id);
+  bool DispatchRecordWaitEvent(uint32_t user_stream_id, uint32_t memory_stream_id);
+
+  bool SyncStream(size_t stream_id);
+  bool SyncAllStreams();
+  bool SyncNotDefaultStreams();
+
+  bool WaitMultiStream(size_t wait_stream_id);
+
+ protected:
+  TaskIdOnStreamManagerPtr task_id_on_stream_manager_;
+  std::unordered_map<size_t, std::mutex> stream_mutexes_;
+  EventPoolPtr event_pool_;
+
+  DeviceResManager *device_res_base_;
+  SpinLock lock_;
+};
+using MultiStreamControllerPtr = std::shared_ptr<MultiStreamController>;
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_MULTI_STREAM_CONTROLLER_HEADER_H
diff --git a/inferrt/src/hardware/hardware_abstract/stream_util.h b/inferrt/src/hardware/hardware_abstract/stream_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..59a21bc129b235d440b12622d9b8027aba4acabf
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/stream_util.h
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_STREAM_UTIL_H_
+#define MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_STREAM_UTIL_H_
+#include <cstdint>
+namespace mindspore {
+constexpr uint32_t kDefaultStreamIndex = 0U;
+constexpr uint32_t kWorldGroupStreamIndex = 1U;
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_INCLUDE_COMMON_UTILS_STREAM_UTIL_H_
diff --git a/inferrt/src/hardware/hardware_abstract/visible.h b/inferrt/src/hardware/hardware_abstract/visible.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a3fc96c5a2cea87921ba246db95ff986ffcdbba
--- /dev/null
+++ b/inferrt/src/hardware/hardware_abstract/visible.h
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INFERRT_SRC_HARDWARE_VISIBLE_H_
+#define INFERRT_SRC_HARDWARE_VISIBLE_H_
+
+#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32) || defined(__CYGWIN__))
+#ifdef HARDWARE_DLL
+#define HARDWARE_EXPORT __declspec(dllexport)
+#else
+#define HARDWARE_EXPORT __declspec(dllimport)
+#endif
+#define HARDWARE_LOCAL
+#else
+#define HARDWARE_EXPORT __attribute__((visibility("default")))
+#define HARDWARE_LOCAL __attribute__((visibility("hidden")))
+#endif
+
+#endif  // INFERRT_SRC_HARDWARE_VISIBLE_H_
diff --git a/inferrt/src/hardware/tests/CMakeLists.txt b/inferrt/src/hardware/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6e4abec03f43ff40c71cec65c3555a42a4a9d0d1
--- /dev/null
+++ b/inferrt/src/hardware/tests/CMakeLists.txt
@@ -0,0 +1,32 @@
+check_debug_log_out()
+
+if(DEFINED ENV{ASCEND_CUSTOM_PATH})
+    set(ASCEND_PATH $ENV{ASCEND_CUSTOM_PATH})
+else()
+    set(ASCEND_PATH /usr/local/Ascend)
+endif()
+
+message("Note compile ascend path: ${ASCEND_PATH}")
+include_directories(${ASCEND_PATH}/latest/include/)
+link_directories(${ASCEND_PATH}/latest/lib64/)
+
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+include_directories(${Python3_INCLUDE_DIRS})
+
+set(depname "pybind11")
+set(PYBIND11_PATH "${PROJECT_SOURCE_DIR}/${depname}-src")
+include_directories(${PYBIND11_PATH}/include)
+
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ldl")
+
+add_executable(test_obj test_func.cc)
+target_link_libraries(test_obj
+    -Wl,--whole-archive
+    hardware_ascend_obj
+    -Wl,--no-whole-archive
+    hardware_abstract_obj
+    Python3::Python
+    ascendcl
+    runtime
+    )
\ No newline at end of file
<< "Get device context failed."; + return 0; + } + if (device_context->device_res_manager_ == nullptr) { + LOG_ERROR << "Get device res manager failed."; + return 0; + } + device_context->Initialize(); + + // Test allocate memory. + auto ptr = device_context->device_res_manager_->AllocateMemory(8); + LOG_ERROR << "ptr:" << ptr; + device_context->device_res_manager_->FreeMemory(ptr); + + return 0; +}