From 52d457ab8ec58e1233527885ab39931e6eb850eb Mon Sep 17 00:00:00 2001 From: ZhouChen Date: Wed, 14 May 2025 07:54:43 +0000 Subject: [PATCH] !2967 release const mem after init_graph run Merge pull request !2967 from ZhouChen/const_mem_release --- tf_adapter/kernels/geop_npu.cc | 11 +++ .../scoped_graph_manager.py | 33 +++++++ tf_adapter/swig/ge_plugin.i | 10 ++- .../st/kernels/testcase/geop_npu_test.cc | 14 +++ .../ut/kernels/testcase/geop_npu_test.cc | 14 +++ tf_adapter/util/scoped_graph_manager.cc | 87 +++++++++++++++++++ tf_adapter/util/scoped_graph_manager.h | 59 +++++++++++++ .../util/scoped_graph_manager_interface.cc | 26 ++++++ .../util/scoped_graph_manager_interface.h | 25 ++++++ 9 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 tf_adapter/python/npu_bridge/scoped_graph_manager/scoped_graph_manager.py create mode 100644 tf_adapter/util/scoped_graph_manager.cc create mode 100644 tf_adapter/util/scoped_graph_manager.h create mode 100644 tf_adapter/util/scoped_graph_manager_interface.cc create mode 100644 tf_adapter/util/scoped_graph_manager_interface.h diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 80fa15a3b..45008c9a8 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -81,6 +81,8 @@ #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tf_adapter/util/profiler.h" +#include "tf_adapter/util/scoped_graph_manager_interface.h" +#include "tf_adapter/util/scoped_graph_manager.h" namespace tensorflow { #ifdef TF_VERSION_TF2 @@ -1099,6 +1101,7 @@ Status GeOp::AddGraph(OpKernelContext *ctx, const uint32_t &graph_id) { ctx->num_outputs(), sess_options_, init_options_, graph_options); ADP_LOG(EVENT) << "[GEOP] call ge session add graph jit_compile: " << jit_compile_; graph_options["ge.exec.graphIOMemAllocMode"] = "ByGE"; + const auto graph_option_ascend_string = ChangeStringToAscendString(graph_options); ADP_LOG(INFO) << 
"Graph options: "; NpuAttrs::LogOptions(graph_options); @@ -1293,6 +1296,14 @@ Status GeOp::CompileAndRunGraph(OpKernelContext *ctx, const std::vector &input_shapes, DoneCallback done) { mutex_lock lock{graph_handler_.graph_mu}; + + bool is_life_control_enabled = ScopedGraphManager::Instance().IsControlEnabled(); + if (is_life_control_enabled) { + ADP_LOG(INFO) << "[GEOP] Life control enabled, set graph options of const life cycle."; + NPU_REQUIRES(ScopedGraphManager::Instance().SetGraph(tf_session_, graph_id_), + errors::Internal("Only support call sess.run once in scope of ScopedGraphManager.")); + graph_options_["ge.constLifecycle"] = "graph"; + } // 当其中一个线程处于compiling状态时,其他线程需要在此处wait,不能直接去编译 while (graph_handler_.status == Compiling) { ADP_LOG(INFO) << "Compiling wait, graph_status: " << graph_handler_.status; diff --git a/tf_adapter/python/npu_bridge/scoped_graph_manager/scoped_graph_manager.py b/tf_adapter/python/npu_bridge/scoped_graph_manager/scoped_graph_manager.py new file mode 100644 index 000000000..73f2d7d3f --- /dev/null +++ b/tf_adapter/python/npu_bridge/scoped_graph_manager/scoped_graph_manager.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+from npu_bridge import tf_adapter
+
+
+class ScopedGraphManager(object):  # Context manager: EnableControl() on entry, Clear() on exit.
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, exe_type, exe_val, exc_tb):
+        self.stop()
+
+    def start(self):
+        tf_adapter.EnableControl()
+
+    def stop(self):
+        tf_adapter.Clear()
\ No newline at end of file
diff --git a/tf_adapter/swig/ge_plugin.i b/tf_adapter/swig/ge_plugin.i
index 11091f6e3..c5f611e34 100644
--- a/tf_adapter/swig/ge_plugin.i
+++ b/tf_adapter/swig/ge_plugin.i
@@ -63,6 +63,10 @@ extern const std::string ProfilerStart(const std::string &level, const std::string &aic_metrics,
                                        const std::string &output_path);
 
 extern const std::string ProfilerStop();
+
+extern void EnableControl();
+
+extern void Clear();
 %}
 
 %template(var_info_vec) std::vector;
@@ -140,4 +144,8 @@ extern int32_t GetDeviceSatMode();
 extern const std::string ProfilerStart(const std::string &level, const std::string &aic_metrics,
                                        const std::string &output_path);
 
-extern const std::string ProfilerStop();
\ No newline at end of file
+extern const std::string ProfilerStop();
+
+extern void EnableControl();
+
+extern void Clear();
\ No newline at end of file
diff --git a/tf_adapter/tests/st/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/st/kernels/testcase/geop_npu_test.cc
index 7af0dc2aa..330e3a61f 100644
--- a/tf_adapter/tests/st/kernels/testcase/geop_npu_test.cc
+++ b/tf_adapter/tests/st/kernels/testcase/geop_npu_test.cc
@@ -10,6 +10,8 @@
 #include "ge_stub.h"
 #include "callback_executor.h"
 #include "tf_adapter/util/profiler_interface.h"
+#include "tf_adapter/util/scoped_graph_manager_interface.h"
+#include "tf_adapter/util/scoped_graph_manager.h"
 #define private public
 #include "tf_adapter/kernels/geop_npu.h"
 #undef private
@@ -500,6 +502,18 @@ TEST_F(GeOpTest, GeOpFuncTestWithProfiling) {
   EXPECT_TRUE(ProfilerStop().empty());
 }
 
+TEST_F(GeOpTest, GeOpFuncTestWithLifeCycleControl) {
+  NpuClose();
+  NodeDef node_def;
+  std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt";
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  EnableControl();
+  EXPECT_TRUE(ScopedGraphManager::Instance().IsControlEnabled());
+  EXPECT_TRUE(GeOpRunGraphAsync(graph_def_path, inputs, node_def, "GeOp1_0").ok());
+  Clear();
+  EXPECT_FALSE(ScopedGraphManager::Instance().IsControlEnabled());
+}
+
 TEST_F(GeOpTest, GeOpFuncTestWithProfilingDefaultAicMetircs) {
   NpuClose();
   NodeDef node_def;
diff --git a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
index 43b579fc5..425d3685e 100644
--- a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
+++ b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc
@@ -13,6 +13,8 @@
 #include "tf_adapter/util/util.h"
 #include "callback_executor.h"
 #include "tf_adapter/util/profiler_interface.h"
+#include "tf_adapter/util/scoped_graph_manager_interface.h"
+#include "tf_adapter/util/scoped_graph_manager.h"
 #define private public
 #include "tf_adapter/kernels/geop_npu.h"
 #undef private
@@ -247,6 +249,18 @@ TEST_F(GeOpTest, GeOpFuncTestWithProfiling) {
   EXPECT_TRUE(ProfilerStop().empty());
 }
 
+TEST_F(GeOpTest, GeOpFuncTestWithLifeCycleControl) {
+  NpuClose();
+  NodeDef node_def;
+  std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop.pbtxt";
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  EnableControl();
+  EXPECT_TRUE(ScopedGraphManager::Instance().IsControlEnabled());
+  EXPECT_TRUE(GeOpRunGraphAsync(graph_def_path, inputs, node_def, "GeOp1_0").ok());
+  Clear();
+  EXPECT_FALSE(ScopedGraphManager::Instance().IsControlEnabled());
+}
+
 TEST_F(GeOpTest, GeOpFuncTestWithProfilingDefaultAicMetircs) {
   NpuClose();
   NodeDef node_def;
diff --git a/tf_adapter/util/scoped_graph_manager.cc b/tf_adapter/util/scoped_graph_manager.cc
new file mode 100644
index 000000000..069c383a0
--- /dev/null
+++ b/tf_adapter/util/scoped_graph_manager.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scoped_graph_manager.h"
+#include "tf_adapter/common/adapter_logger.h"
+#include "tf_adapter/util/session_manager.h"
+#include "npu_attrs.h"
+
+namespace tensorflow {
+// UINT32_MAX marks "no graph registered in the current scope".
+uint32_t ScopedGraphManager::graph_id_ = UINT32_MAX;
+std::string ScopedGraphManager::tf_session_;
+std::mutex ScopedGraphManager::mutex_;
+bool ScopedGraphManager::graph_life_control_enabled_ = false;
+
+// Returns the single ScopedGraphManager instance.
+ScopedGraphManager& ScopedGraphManager::Instance() {
+  static ScopedGraphManager instance;
+  return instance;
+}
+
+// Turns graph life-cycle control on.
+void ScopedGraphManager::EnableControl() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  graph_life_control_enabled_ = true;
+  ADP_LOG(INFO) << "[ScopedGraphManager] Set graph_life_control_enabled_ true";
+}
+
+// Turns graph life-cycle control off and forgets the registered session/graph.
+void ScopedGraphManager::DisableControl() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  graph_life_control_enabled_ = false;
+  graph_id_ = UINT32_MAX;
+  tf_session_.clear();
+  ADP_LOG(INFO) << "[ScopedGraphManager] Set graph_life_control_enabled_ false";
+}
+
+// Returns whether graph life-cycle control is currently enabled.
+bool ScopedGraphManager::IsControlEnabled() const {
+  // The flag is written under mutex_ by EnableControl/DisableControl, so read it under the same lock.
+  std::lock_guard<std::mutex> lock(mutex_);
+  ADP_LOG(INFO) << "[ScopedGraphManager] Get graph_life_control_enabled_: " << graph_life_control_enabled_;
+  return graph_life_control_enabled_;
+}
+
+// Registers the session/graph that Clear() will remove.
+// Returns false, leaving the current registration untouched, if a graph is already registered.
+bool ScopedGraphManager::SetGraph(const std::string& tf_session, const uint32_t& graph_id) {
+  // Check and store inside one critical section so that concurrent callers cannot both register.
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (graph_id_ != UINT32_MAX) {
+    ADP_LOG(ERROR) << "[ScopedGraphManager] Only support call sess.run once in scope of ScopedGraphManager.";
+    return false;
+  }
+  ADP_LOG(INFO) << "[ScopedGraphManager] SetGraph tf_session: " << tf_session << ", graph_id: " << graph_id;
+  tf_session_ = tf_session;
+  graph_id_ = graph_id;
+  ADP_LOG(INFO) << "[ScopedGraphManager] SetGraph success for tf_session: " << tf_session << ", graph_id: " << graph_id;
+  return true;
+}
+
+// Removes the registered graph (if any) from its GE session, then disables life-cycle control.
+void ScopedGraphManager::Clear() {
+  ADP_LOG(INFO) << "[ScopedGraphManager] Begin to clear after graph run";
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (graph_id_ == UINT32_MAX) {
+      // SetGraph was never called in this scope: skip the session lookup so that no session is
+      // requested for an unset tf_session name and RemoveGraph is not called with the sentinel id.
+      ADP_LOG(INFO) << "[ScopedGraphManager] No graph registered, skip RemoveGraph";
+    } else {
+      ge::Session* global_ge_session = nullptr;
+      std::map<std::string, std::string> global_sess_options;
+      if (!SessionManager::GetInstance().GetOrCreateGeSession(tf_session_, global_ge_session, global_sess_options)) {
+        ADP_LOG(WARNING) << "[ScopedGraphManager] Failed to get session for tf_session: " << tf_session_;
+      }
+      if (global_ge_session != nullptr) {
+        // Only report success when GE actually removed the graph.
+        if (global_ge_session->RemoveGraph(graph_id_) == ge::SUCCESS) {
+          ADP_LOG(INFO) << "[ScopedGraphManager] RemoveGraph success for tf_session: "
+                        << tf_session_ << ", graph_id: " << graph_id_;
+        } else {
+          ADP_LOG(WARNING) << "[ScopedGraphManager] RemoveGraph failed for tf_session: "
+                           << tf_session_ << ", graph_id: " << graph_id_;
+        }
+      }
+    }
+  }
+  // DisableControl() takes mutex_ itself, so it must run after the scoped lock above is released.
+  DisableControl();
+  ADP_LOG(INFO) << "[ScopedGraphManager] Clear finished";
+}
+}  // namespace tensorflow
diff --git a/tf_adapter/util/scoped_graph_manager.h b/tf_adapter/util/scoped_graph_manager.h
new file mode 100644
index 000000000..fe535183e
--- /dev/null
+++ b/tf_adapter/util/scoped_graph_manager.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORFLOW_UTILS_SCOPED_GRAPH_MANAGER_H_
+#define TENSORFLOW_UTILS_SCOPED_GRAPH_MANAGER_H_
+#include "acl/acl_prof.h"
+#include <mutex>
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/lib/core/status.h"
+#include <string>
+
+namespace tensorflow {
+// Tracks the single session/graph run while graph life-cycle control is enabled, so that Clear()
+// can remove that graph afterwards. State is held in static members and modified under mutex_.
+class ScopedGraphManager {
+public:
+  // Returns the single ScopedGraphManager instance.
+  static ScopedGraphManager& Instance();
+
+  // Enable graph life-cycle control.
+  void EnableControl();
+
+  // Disable graph life-cycle control and reset the registered session/graph.
+  void DisableControl();
+
+  // Returns whether graph life-cycle control is enabled.
+  bool IsControlEnabled() const;
+
+  // Register the graph; returns false if a graph is already registered.
+  bool SetGraph(const std::string& tf_session, const uint32_t& graph_id);
+
+  // Clear state: unload the registered graph and release the memory it occupies.
+  void Clear();
+
+private:
+  ScopedGraphManager() = default;
+
+  // UINT32_MAX means no graph is registered.
+  static uint32_t graph_id_;
+  static std::string tf_session_;
+  static std::mutex mutex_;
+  static bool graph_life_control_enabled_;
+};
+}  // namespace tensorflow
+
+#endif
diff --git a/tf_adapter/util/scoped_graph_manager_interface.cc b/tf_adapter/util/scoped_graph_manager_interface.cc
new file mode 100644
index 000000000..233faec1a
--- /dev/null
+++ b/tf_adapter/util/scoped_graph_manager_interface.cc
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scoped_graph_manager_interface.h"
+#include "scoped_graph_manager.h"
+
+// Enables graph life-cycle control on the ScopedGraphManager singleton.
+void EnableControl() {
+  tensorflow::ScopedGraphManager::Instance().EnableControl();
+}
+
+// Forwards to ScopedGraphManager::Clear(): removes the registered graph and disables life-cycle control.
+void Clear() {
+  tensorflow::ScopedGraphManager::Instance().Clear();
+}
diff --git a/tf_adapter/util/scoped_graph_manager_interface.h b/tf_adapter/util/scoped_graph_manager_interface.h
new file mode 100644
index 000000000..f380bd9ae
--- /dev/null
+++ b/tf_adapter/util/scoped_graph_manager_interface.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORFLOW_UTILS_SCOPED_GRAPH_MANAGER_INTERFACE_H_
+#define TENSORFLOW_UTILS_SCOPED_GRAPH_MANAGER_INTERFACE_H_
+#include <string>
+
+// Enables graph life-cycle control (calls ScopedGraphManager::EnableControl).
+void EnableControl();
+
+// Calls ScopedGraphManager::Clear.
+void Clear();
+
+#endif
-- 
Gitee