From 4cfcabcb4c445b23c59dd38abea0e4cea8c9e199 Mon Sep 17 00:00:00 2001 From: xuepeng Date: Wed, 23 Nov 2022 19:32:46 +0800 Subject: [PATCH 1/4] add LoadAndExecuteOm op --- tf_adapter/kernels/om_ops.cc | 92 ++++++++++++++++++++++++++++++++++++ tf_adapter/ops/npu_ops.cc | 5 ++ 2 files changed, 97 insertions(+) create mode 100644 tf_adapter/kernels/om_ops.cc diff --git a/tf_adapter/kernels/om_ops.cc b/tf_adapter/kernels/om_ops.cc new file mode 100644 index 000000000..d06b8d349 --- /dev/null +++ b/tf_adapter/kernels/om_ops.cc @@ -0,0 +1,92 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "acl_mdl.h" + +namespace tensorflow { +namespace { +class OmExecutor { + public: + /// + /// \param om_path Absolute file path of the om file + /// \param executor Created om executor + /// \return Status::OK() or error status if any error occurs + static Status Create(const std::string &om_path, std::unique_ptr &executor) { + // TODO: 根据OM文件路径创建OmExecutor,缓存全部可以预创建的信息 + } + + /// + /// \param inputs Tensorflow host input tensors + /// \param outputs Empty output tensors to be filling + /// \return Status::OK() or error status if any error occurs + Status Execute(const std::vector &inputs, std::vector &outputs) { + // TODO: OM执行逻辑,in为TF的Host tensor,outputs为空vector + } + + private: + static Status TensorsToAclDataset(const std::vector &inputs, std::unique_ptr &dataset) { + // TODO: Tensors到aclmdlDataset的转换 + return Status::OK(); + } + static Status AclDatasetToTensors(aclmdlDataset *dataset, std::vector &tensors) { + // TODO: aclmdlDataset到Tensors的转换 + return Status::OK(); + } + // TODO: OmExecutor::Create时成员变量这里缓存全部可以预创建的信息,包括但不限于: + // model id + // stream + // aclmdlDataset*对象,不需要每次创建 + // 静态shape时可填充完全体 + // 动态shape时可以设置除了地址和shape外的全部信息 +}; + +class LoadAndExecuteOmOp : public OpKernel { + public: + explicit LoadAndExecuteOmOp(OpKernelConstruction *ctx) : OpKernel(ctx) 
{ + OP_REQUIRES_OK(ctx, ctx->GetAttr("om_path", &om_path_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_)); + } + ~LoadAndExecuteOmOp() override = default; + + void Compute(OpKernelContext *ctx) override { + OP_REQUIRES_OK(ctx, Initialize()); + std::vector inputs; + inputs.reserve(ctx->num_inputs()); + for (int i = 0; i < ctx->num_inputs(); i++) { + inputs.push_back(ctx->input(i)); + } + + std::vector outputs; + OP_REQUIRES_OK(ctx, executor_->Execute(inputs, outputs)); + + for (int i = 0; i < static_cast(outputs.size()); i++) { + ctx->set_output(i, std::move(outputs[i])); + } + } + + private: + Status Initialize() { + if (initialized_) { // 类似读写锁的作用,走SyncWith,不用抢Mutex锁判断是否初始化 + return Status::OK(); + } + std::unique_lock lk(mu_); + if (initialized_) { // 需要二次确认 + return Status::OK(); + } + // TODO: 将om_path_转换为绝对路径 + TF_RETURN_IF_ERROR(OmExecutor::Create(om_path_, executor_)); + initialized_ = true; + return Status::OK(); + } + + std::mutex mu_; + std::atomic_bool initialized_{false}; + std::string om_path_; + std::string executor_type_; // Reserved + + std::unique_ptr executor_; +}; +} // namespace +REGISTER_KERNEL_BUILDER(Name("LoadAndExecuteOm").Device(DEVICE_CPU), LoadAndExecuteOmOp); +} // namespace tensorflow diff --git a/tf_adapter/ops/npu_ops.cc b/tf_adapter/ops/npu_ops.cc index 3f90830b0..830fd2798 100644 --- a/tf_adapter/ops/npu_ops.cc +++ b/tf_adapter/ops/npu_ops.cc @@ -34,6 +34,11 @@ REGISTER_OP("GeOp") .Attr("data_format: { 'NHWC', 'NCHW', 'NDHWC', 'NCDHW', 'DHWCN', 'DHWNC', 'ND'} = 'NHWC'") .SetIsStateful(); +REGISTER_OP("LoadAndExecuteOm") + .Attr("om_path: string") + .Attr("executor_type: string = ''") + .SetIsStateful(); + REGISTER_OP("DPOP") .Input("inputs: Tin") .Attr("Tin: list(type) >= 0") -- Gitee From 985d9d66f0725455d0982d31994a1191cbdf29b2 Mon Sep 17 00:00:00 2001 From: xuepeng Date: Thu, 24 Nov 2022 16:26:24 +0800 Subject: [PATCH 2/4] add pass for process om node --- tf_adapter/kernels/om_ops.cc | 34 ++--- 
tf_adapter/ops/npu_ops.cc | 7 +- .../optimizers/get_attr_optimize_pass.cc | 2 +- tf_adapter/optimizers/mark_start_node_pass.cc | 2 +- tf_adapter/optimizers/om_node_prepare_pass.cc | 127 ++++++++++++++++++ 5 files changed, 152 insertions(+), 20 deletions(-) create mode 100644 tf_adapter/optimizers/om_node_prepare_pass.cc diff --git a/tf_adapter/kernels/om_ops.cc b/tf_adapter/kernels/om_ops.cc index d06b8d349..e5922d8ce 100644 --- a/tf_adapter/kernels/om_ops.cc +++ b/tf_adapter/kernels/om_ops.cc @@ -2,8 +2,6 @@ #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" -#include "acl_mdl.h" - namespace tensorflow { namespace { class OmExecutor { @@ -13,7 +11,11 @@ class OmExecutor { /// \param executor Created om executor /// \return Status::OK() or error status if any error occurs static Status Create(const std::string &om_path, std::unique_ptr &executor) { - // TODO: 根据OM文件路径创建OmExecutor,缓存全部可以预创建的信息 + executor.reset(new (std::nothrow) OmExecutor(om_path)); + if (executor == nullptr) { + return errors::Internal("Failed create executor for om ", om_path); + } + return Status::OK(); } /// @@ -22,23 +24,11 @@ class OmExecutor { /// \return Status::OK() or error status if any error occurs Status Execute(const std::vector &inputs, std::vector &outputs) { // TODO: OM执行逻辑,in为TF的Host tensor,outputs为空vector + return Status::OK(); } private: - static Status TensorsToAclDataset(const std::vector &inputs, std::unique_ptr &dataset) { - // TODO: Tensors到aclmdlDataset的转换 - return Status::OK(); - } - static Status AclDatasetToTensors(aclmdlDataset *dataset, std::vector &tensors) { - // TODO: aclmdlDataset到Tensors的转换 - return Status::OK(); - } - // TODO: OmExecutor::Create时成员变量这里缓存全部可以预创建的信息,包括但不限于: - // model id - // stream - // aclmdlDataset*对象,不需要每次创建 - // 静态shape时可填充完全体 - // 动态shape时可以设置除了地址和shape外的全部信息 + explicit OmExecutor(const std::string &om_path) {} }; class LoadAndExecuteOmOp : public OpKernel { @@ -60,6 +50,16 @@ class LoadAndExecuteOmOp 
: public OpKernel { std::vector outputs; OP_REQUIRES_OK(ctx, executor_->Execute(inputs, outputs)); + // TODO: Remove this after om executor ready + /************************************************/ + if (outputs.empty()) { + for (int i = 0; i < ctx->num_outputs(); i++) { + Tensor *unused_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, TensorShape{}, &unused_tensor)); + } + } + /************************************************/ + for (int i = 0; i < static_cast(outputs.size()); i++) { ctx->set_output(i, std::move(outputs[i])); } diff --git a/tf_adapter/ops/npu_ops.cc b/tf_adapter/ops/npu_ops.cc index 830fd2798..8bfad5270 100644 --- a/tf_adapter/ops/npu_ops.cc +++ b/tf_adapter/ops/npu_ops.cc @@ -35,9 +35,14 @@ REGISTER_OP("GeOp") .SetIsStateful(); REGISTER_OP("LoadAndExecuteOm") + .Input("inputs: Tin") + .Attr("Tin: list(type) >= 0") + .Output("outputs: output_dtypes") + .Attr("output_dtypes: list(type) >= 0") .Attr("om_path: string") .Attr("executor_type: string = ''") - .SetIsStateful(); + .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape); REGISTER_OP("DPOP") .Input("inputs: Tin") diff --git a/tf_adapter/optimizers/get_attr_optimize_pass.cc b/tf_adapter/optimizers/get_attr_optimize_pass.cc index 7a8f88cd2..d3a1d78aa 100644 --- a/tf_adapter/optimizers/get_attr_optimize_pass.cc +++ b/tf_adapter/optimizers/get_attr_optimize_pass.cc @@ -115,5 +115,5 @@ Status GetAttrOptimizePass::Run(const GraphOptimizationPassOptions &options) { return Status::OK(); } -REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 0, GetAttrOptimizePass); +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, -1, GetAttrOptimizePass); } // namespace tensorflow diff --git a/tf_adapter/optimizers/mark_start_node_pass.cc b/tf_adapter/optimizers/mark_start_node_pass.cc index 60b1fafa3..f2223ae6e 100644 --- a/tf_adapter/optimizers/mark_start_node_pass.cc +++ b/tf_adapter/optimizers/mark_start_node_pass.cc @@ -183,5 +183,5 @@ Status 
MarkStartNodePass::TraverseNode(const Node *start_node) { return Status::OK(); } -REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 0, MarkStartNodePass); +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, -1, MarkStartNodePass); } // namespace tensorflow diff --git a/tf_adapter/optimizers/om_node_prepare_pass.cc b/tf_adapter/optimizers/om_node_prepare_pass.cc new file mode 100644 index 000000000..024ed96da --- /dev/null +++ b/tf_adapter/optimizers/om_node_prepare_pass.cc @@ -0,0 +1,127 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tf_adapter/common/adapter_logger.h" +#include "tf_adapter/common/common.h" +#include "tf_adapter/util/npu_attrs.h" + +#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/graph/node_builder.h" + +namespace tensorflow { +class OmNodePreparePass : public GraphOptimizationPass { + public: + OmNodePreparePass() = default; + ~OmNodePreparePass() override = default; + Status Run(const GraphOptimizationPassOptions &options) override; + + private: + static std::vector GetGraphOmNodes(const Graph &graph); + static std::map GetGraphConfigs(const Graph &graph); + static Status ProcessGraph(std::unique_ptr &graph, FunctionLibraryDefinition &fdef_lib); +}; + +Status OmNodePreparePass::Run(const GraphOptimizationPassOptions &options) { + if ((options.graph == nullptr && options.partition_graphs == nullptr) || options.flib_def == nullptr) { + return Status::OK(); + } + + if (options.graph != nullptr) { + TF_RETURN_IF_ERROR(ProcessGraph(*options.graph, *options.flib_def)); + } else if (options.partition_graphs != nullptr) { + for (auto &partition_graph : *options.partition_graphs) { + TF_RETURN_IF_ERROR(ProcessGraph(partition_graph.second, *options.flib_def)); + } + } + + return Status::OK(); +} + +constexpr const char *kOmNodeType = "LoadAndExecuteOm"; +std::vector OmNodePreparePass::GetGraphOmNodes(const Graph &graph) { + std::vector om_nodes; + for (auto node : graph.nodes()) { + if (node->type_string() != kOmNodeType) { + continue; + } + ADP_LOG(INFO) << "Collect om node " << node->name() << " " << node->type_string(); + om_nodes.emplace_back(node); + } + return om_nodes; +} + +std::map OmNodePreparePass::GetGraphConfigs(const Graph &graph) { + for (Node *n : graph.nodes()) { + if ((n != nullptr) && (n->attrs().Find("_NpuOptimizer") != nullptr)) { + return NpuAttrs::GetAllAttrOptions(n->attrs()); + } + } + return {}; +} + +Status OmNodePreparePass::ProcessGraph(std::unique_ptr &graph, FunctionLibraryDefinition 
&fdef_lib) { + auto om_nodes = GetGraphOmNodes(*graph); + if (om_nodes.empty()) { + ADP_LOG(INFO) << "Skip process graph as no om nodes found"; + return Status::OK(); + } + + static std::atomic_uint64_t graph_index{0U}; + uint64_t index = graph_index.fetch_add(1U); + if (kDumpGraph) { + const std::string pbtxt_path = GetDumpPath() + "TF_BeforeOmPrepare_" + std::to_string(index) + ".pbtxt"; + (void) WriteTextProto(Env::Default(), pbtxt_path, graph->ToGraphDefDebug()); + } + + ADP_LOG(INFO) << "Prepare for om graph as " << om_nodes.size() << " om nodes found"; + std::string init_fun_name = fdef_lib.UniqueFunctionName("empty_for_npu_init_"); + tensorflow::AttrValue function_attr; + function_attr.mutable_func()->set_name(init_fun_name); + Node *geop_node = nullptr; + TF_RETURN_IF_ERROR(tensorflow::NodeBuilder("system_init", "GeOp") + .Input(std::vector{}) + .Attr("Tin", tensorflow::DataTypeVector{}) + .Attr("Tout", tensorflow::DataTypeVector{}) + .Attr("function", function_attr) + .Device(om_nodes.front()->assigned_device_name()) + .AssignedDevice(om_nodes.front()->assigned_device_name()) + .Finalize(graph.get(), &geop_node)); + + geop_node->AddAttr("_NpuOptimizer", "NpuOptimizer"); + for (const auto &option : GetGraphConfigs(*graph)) { + geop_node->AddAttr(std::string("_") + option.first, option.second); + } + + tensorflow::FunctionDef fdef; + fdef.mutable_signature()->set_name(init_fun_name); + *fdef.mutable_attr() = geop_node->def().attr(); + TF_RETURN_IF_ERROR(fdef_lib.AddFunctionDef(fdef)); + + for (auto &om_node : om_nodes) { + om_node->AddAttr("_NoNeedOptimize", true); // Skip optimize for graph with om node + ADP_LOG(INFO) << "Add control edge from system init op " << geop_node->name() << " to om node " << om_node->name(); + REQUIRES_NOT_NULL(graph->AddControlEdge(geop_node, om_node)); + } + + if (kDumpGraph) { + const std::string pbtxt_path = GetDumpPath() + "TF_AfterOmPrepare_" + std::to_string(index) + ".pbtxt"; + (void) WriteTextProto(Env::Default(), 
pbtxt_path, graph->ToGraphDefDebug()); + } + + return Status::OK(); +} +REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 0, OmNodePreparePass); +} // namespace tensorflow -- Gitee From b261760146224db03b7a68aefe5c4aaa1f905f89 Mon Sep 17 00:00:00 2001 From: xuepeng Date: Sat, 26 Nov 2022 11:04:38 +0800 Subject: [PATCH 3/4] make om executor an standalon class --- tf_adapter/kernels/om_executor.cc | 35 +++++++++++++++ tf_adapter/kernels/om_executor.h | 40 +++++++++++++++++ tf_adapter/kernels/om_ops.cc | 45 ++++++++----------- tf_adapter/optimizers/om_node_prepare_pass.cc | 2 +- 4 files changed, 94 insertions(+), 28 deletions(-) create mode 100644 tf_adapter/kernels/om_executor.cc create mode 100644 tf_adapter/kernels/om_executor.h diff --git a/tf_adapter/kernels/om_executor.cc b/tf_adapter/kernels/om_executor.cc new file mode 100644 index 000000000..f6725be4e --- /dev/null +++ b/tf_adapter/kernels/om_executor.cc @@ -0,0 +1,35 @@ +/* +* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include "om_executor.h" + +namespace tensorflow { +OmExecutor::OmExecutor(const std::string &om_path) {} + +Status OmExecutor::Create(const std::string &om_path, std::unique_ptr &executor) { + // TODO: OM 执行器创建逻辑 + executor.reset(new (std::nothrow) OmExecutor(om_path)); + if (executor == nullptr) { + return errors::Internal("Failed create executor for om ", om_path); + } + return Status::OK(); +} + +Status OmExecutor::Execute(const std::vector &inputs, std::vector &outputs) { + // TODO: OM执行逻辑,in为TF的Host tensor,outputs为空vector + return Status::OK(); +} +} // namespace tensorflow diff --git a/tf_adapter/kernels/om_executor.h b/tf_adapter/kernels/om_executor.h new file mode 100644 index 000000000..1fc8f9c97 --- /dev/null +++ b/tf_adapter/kernels/om_executor.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TENSORFLOW_KERNELS_OM_EXECUTOR_H_ +#define TENSORFLOW_KERNELS_OM_EXECUTOR_H_ +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +class OmExecutor { + public: + /// + /// \param om_path Absolute file path of the om file + /// \param executor Created om executor + /// \return Status::OK() or error status if any error occurs + static Status Create(const std::string &om_path, std::unique_ptr &executor); + + /// + /// \param inputs Tensorflow host input tensors + /// \param outputs Empty output tensors to be filling + /// \return Status::OK() or error status if any error occurs + Status Execute(const std::vector &inputs, std::vector &outputs); + + private: + explicit OmExecutor(const std::string &om_path); +}; +} // namespace tensorflow +#endif // TENSORFLOW_KERNELS_OM_EXECUTOR_H_ diff --git a/tf_adapter/kernels/om_ops.cc b/tf_adapter/kernels/om_ops.cc index e5922d8ce..d53cbd282 100644 --- a/tf_adapter/kernels/om_ops.cc +++ b/tf_adapter/kernels/om_ops.cc @@ -1,36 +1,27 @@ +/* +* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "om_executor.h" + namespace tensorflow { namespace { -class OmExecutor { - public: - /// - /// \param om_path Absolute file path of the om file - /// \param executor Created om executor - /// \return Status::OK() or error status if any error occurs - static Status Create(const std::string &om_path, std::unique_ptr &executor) { - executor.reset(new (std::nothrow) OmExecutor(om_path)); - if (executor == nullptr) { - return errors::Internal("Failed create executor for om ", om_path); - } - return Status::OK(); - } - - /// - /// \param inputs Tensorflow host input tensors - /// \param outputs Empty output tensors to be filling - /// \return Status::OK() or error status if any error occurs - Status Execute(const std::vector &inputs, std::vector &outputs) { - // TODO: OM执行逻辑,in为TF的Host tensor,outputs为空vector - return Status::OK(); - } - - private: - explicit OmExecutor(const std::string &om_path) {} -}; - class LoadAndExecuteOmOp : public OpKernel { public: explicit LoadAndExecuteOmOp(OpKernelConstruction *ctx) : OpKernel(ctx) { diff --git a/tf_adapter/optimizers/om_node_prepare_pass.cc b/tf_adapter/optimizers/om_node_prepare_pass.cc index 024ed96da..b092e2603 100644 --- a/tf_adapter/optimizers/om_node_prepare_pass.cc +++ b/tf_adapter/optimizers/om_node_prepare_pass.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
-- Gitee From 214356a3856796be0ec742a2a8725aaf8ef03c28 Mon Sep 17 00:00:00 2001 From: "tanghaojie1@huawei.com" Date: Mon, 28 Nov 2022 22:31:04 +0800 Subject: [PATCH 4/4] tf_serving require --- tf_adapter/common/common.h | 11 + tf_adapter/kernels/om_executor.cc | 295 +++++++++++++++++- tf_adapter/kernels/om_executor.h | 54 +++- .../depends/ascendcl/src/ascendcl_stub.cc | 47 +++ .../depends/ascendcl/src/ascendcl_stub.h | 2 +- 5 files changed, 400 insertions(+), 9 deletions(-) diff --git a/tf_adapter/common/common.h b/tf_adapter/common/common.h index fb3222de2..5cccdefa2 100644 --- a/tf_adapter/common/common.h +++ b/tf_adapter/common/common.h @@ -20,6 +20,7 @@ #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tf_adapter/common/adapter_logger.h" +#include "acl/acl_base.h" #define CHECK_NOT_NULL(v) \ do { \ @@ -44,6 +45,16 @@ } \ } while (false) +#define REQUIRES_ACL_STATUS_OK(expr, interface) \ + do { \ + const auto __ret = (expr); \ + if (__ret != ACL_SUCCESS) { \ + LOG(ERROR) << #interface " is failed, ret code is " << __ret; \ + return errors::Internal(#interface " is failed."); \ + } \ + } \ + while (false) + namespace npu { constexpr int ADAPTER_ENV_MAX_LENTH = 1024 * 1024; } // namespace npu diff --git a/tf_adapter/kernels/om_executor.cc b/tf_adapter/kernels/om_executor.cc index f6725be4e..36c05e2e4 100644 --- a/tf_adapter/kernels/om_executor.cc +++ b/tf_adapter/kernels/om_executor.cc @@ -15,12 +15,283 @@ */ #include "om_executor.h" +#include "tf_adapter/common/common.h" +#include "tf_adapter/util/npu_attrs.h" namespace tensorflow { -OmExecutor::OmExecutor(const std::string &om_path) {} +ModelProcess::~ModelProcess() { + UnloadModel(); + DestroyInput(); + DestroyOutput(); + aclrtResetDevice(device_id_); +} + +Status ModelProcess::PrepareProcess(const char *path) { + TF_RETURN_IF_ERROR(LoadModelFromFile(path)); + TF_RETURN_IF_ERROR(CreateInput()); + TF_RETURN_IF_ERROR(CreateOutput()); + return Status::OK(); +} + 
+bool ModelProcess::isDynamic(const aclmdlIODims &dims) { + for (size_t i = 0; i < dims.dimCount; ++i) { + if ((dims.dims[i] == -1) || (dims.dims[i] == -2)) { + return true; + } + } + return false; +} + +Status ModelProcess::LoadModelFromFile(const char *path) { + // const auto acl_ret = aclInit(nullptr); + // if ((acl_ret != ACL_SUCCESS) && (acl_ret != ACL_ERROR_REPEAT_INITIALIZE)) { + // return tensorflow::errors::Internal("aclInit faile"); + // } + (void)GetEnvDeviceID(device_id_); + REQUIRES_ACL_STATUS_OK(aclrtSetDevice(device_id_), aclrtSetDevice); + REQUIRES_ACL_STATUS_OK(aclmdlLoadFromFile(path, &model_id_), aclmdlLoadFromFile); + load_flag_ = true; + model_desc_ = aclmdlCreateDesc(); + REQUIRES_NOT_NULL(model_desc_); + REQUIRES_ACL_STATUS_OK(aclmdlGetDesc(model_desc_, model_id_), aclmdlGetDesc); + aclmdlIODims dims = {}; + for (size_t i = 0U; i < aclmdlGetNumInputs(model_desc_); ++i) { + REQUIRES_ACL_STATUS_OK(aclmdlGetInputDims(model_desc_, i, &dims), aclmdlGetInputDims); + if (isDynamic(dims)) { + is_input_dynamic.emplace_back(true); + } else { + is_input_dynamic.emplace_back(false); + } + LOG(INFO) << "this "<< i << " input is " << is_input_dynamic[i]; + } + for (size_t j = 0U; j < aclmdlGetNumOutputs(model_desc_); ++j) { + REQUIRES_ACL_STATUS_OK(aclmdlGetOutputDims(model_desc_, j, &dims), aclmdlGetOutputDims); + if (isDynamic(dims)) { + is_output_dynamic.emplace_back(true); + } else { + is_output_dynamic.emplace_back(false); + } + LOG(INFO) << "this "<< j << " output is " << is_output_dynamic[j]; + } + return Status::OK(); +} + +Status ModelProcess::CreateInput() { + input_ = aclmdlCreateDataset(); + REQUIRES_NOT_NULL(input_); + size_t input_num = aclmdlGetNumInputs(model_desc_); + for (size_t i = 0U; i < input_num; ++i) { + size_t input_size = aclmdlGetInputSizeByIndex(model_desc_, i); + LOG(INFO) << "this "<< i << " input size is " << input_size; + if (input_size == 0U) { + LOG(ERROR) << "current " << i << " input is 0, can not get its real size"; + 
return tensorflow::errors::Internal("get input size is 0"); + } + void *dev_ptr = nullptr; + REQUIRES_ACL_STATUS_OK(aclrtMalloc(&dev_ptr, input_size, ACL_MEM_MALLOC_NORMAL_ONLY), aclrtMalloc); + REQUIRES_NOT_NULL(dev_ptr); + aclDataBuffer *data_buf = aclCreateDataBuffer(dev_ptr, input_size); + REQUIRES_NOT_NULL(data_buf); + REQUIRES_ACL_STATUS_OK(aclmdlAddDatasetBuffer(input_, data_buf), aclmdlAddDatasetBuffer); + } + return Status::OK(); +} + +Status ModelProcess::CreateOutput() { + output_ = aclmdlCreateDataset(); + REQUIRES_NOT_NULL(output_); + size_t output_num = aclmdlGetNumOutputs(model_desc_); + for (size_t i = 0U; i < output_num; ++i) { + size_t output_size = aclmdlGetOutputSizeByIndex(model_desc_, i); + LOG(INFO) << "this "<< i << " output size is " << output_size; + if (output_size == 0U) { + LOG(ERROR) << "current " << i << " output is 0, can not get its real size"; + return tensorflow::errors::Internal("get input size is 0"); + } + void *dev_ptr = nullptr; + REQUIRES_ACL_STATUS_OK(aclrtMalloc(&dev_ptr, output_size, ACL_MEM_MALLOC_NORMAL_ONLY), aclrtMalloc); + REQUIRES_NOT_NULL(dev_ptr); + aclDataBuffer *data_buf = aclCreateDataBuffer(dev_ptr, output_size); + REQUIRES_NOT_NULL(data_buf); + REQUIRES_ACL_STATUS_OK(aclmdlAddDatasetBuffer(output_, data_buf), aclmdlAddDatasetBuffer); + } + return Status::OK(); +} + +Status ModelProcess::Execute(const std::vector &inputs, std::vector &outputs) { + TF_RETURN_IF_ERROR(FeedInput(inputs)); + REQUIRES_ACL_STATUS_OK(aclmdlExecute(model_id_, input_, output_), aclmdlExecute); + TF_RETURN_IF_ERROR(ProcessOutput(outputs)); + return Status::OK(); +} + +Status ModelProcess::FeedInput(const std::vector &inputs) { + if (inputs.size() != aclmdlGetNumInputs(model_desc_)) { + LOG(ERROR) << "input num " << inputs.size() << " is not matched model input num " + << aclmdlGetNumInputs(model_desc_); + return tensorflow::errors::Internal("input num is not matched"); + } + for (size_t i = 0U; i < inputs.size(); ++i) { + auto 
tensor_data = inputs[i].tensor_data().data(); + auto tensor_size = inputs[i].tensor_data().size(); + aclDataBuffer *data_buf = aclmdlGetDatasetBuffer(input_, i); + REQUIRES_NOT_NULL(data_buf); + void *dev_ptr = aclGetDataBufferAddr(data_buf); + REQUIRES_NOT_NULL(dev_ptr); + size_t cur_size = aclGetDataBufferSizeV2(data_buf); + LOG(INFO) << "tensor size is " << tensor_size << " model cur size is " << cur_size; + if (tensor_size > cur_size) { + LOG(ERROR) << "input " << i << " size " << tensor_size << " is larger than model input size " << cur_size; + return tensorflow::errors::Internal("input size is too long"); + } + REQUIRES_ACL_STATUS_OK(aclrtMemcpy(dev_ptr, cur_size, + tensor_data, tensor_size, ACL_MEMCPY_HOST_TO_DEVICE), aclrtMemcpy); + // set shpae + tensorflow::DataType tf_type = inputs[i].dtype(); + aclDataType acl_dt = ACL_DT_UNDEFINED; + TF_RETURN_IF_ERROR(MappingTfDtToAcl(tf_type, acl_dt)); + auto dims = inputs[i].shape().dim_sizes(); + aclTensorDesc *tensor_desc = aclCreateTensorDesc(acl_dt, dims.size(), + (dims.empty() ? 
nullptr : reinterpret_cast(dims.data())), ACL_FORMAT_UNDEFINED); + REQUIRES_NOT_NULL(tensor_desc); + REQUIRES_ACL_STATUS_OK(aclmdlSetDatasetTensorDesc(input_, tensor_desc, i), aclmdlSetDatasetTensorDesc); + aclDestroyTensorDesc(tensor_desc); + tensor_desc = nullptr; + } + return Status::OK(); +} + +Status ModelProcess::ProcessOutput(std::vector &outputs) { + for (size_t i = 0U; i < aclmdlGetNumOutputs(model_desc_); ++i) { + aclDataBuffer *data_buf = aclmdlGetDatasetBuffer(output_, i); + REQUIRES_NOT_NULL(data_buf); + void *dev_ptr = aclGetDataBufferAddr(data_buf); + REQUIRES_NOT_NULL(dev_ptr); + size_t cur_size = aclGetDataBufferSizeV2(data_buf); + aclDataType acl_dt = aclmdlGetOutputDataType(model_desc_, i); + tensorflow::DataType tf_type = DT_FLOAT; + TF_RETURN_IF_ERROR(MappingAclDtToTf(acl_dt, tf_type)); + LOG(INFO) << " model output size is " << cur_size << " dt is " << tf_type; + if (!is_output_dynamic[i]) { + LOG(INFO) << "this out " << i << " is static"; + aclmdlIODims acl_dims = {}; + aclmdlGetOutputDims(model_desc_, i, &acl_dims); + TensorShape tf_shape; + for (size_t j = 0U; j < acl_dims.dimCount; ++j) { + tf_shape.AddDim(acl_dims.dims[j]); + } + Tensor tensor = Tensor(tf_type, tf_shape); + auto tensor_data = const_cast(tensor.tensor_data().data()); + auto tensor_size = tensor.tensor_data().size(); + if (cur_size != tensor_size) { + LOG(ERROR) << " cur_size is " << cur_size << " tensor_size is " << tensor_size; + return errors::Internal("output size is not match"); + } + REQUIRES_ACL_STATUS_OK( + aclrtMemcpy(tensor_data, tensor_size, dev_ptr, cur_size, ACL_MEMCPY_DEVICE_TO_HOST), aclrtMemcpy); + outputs.emplace_back(std::move(tensor)); + } else { + LOG(INFO) << "this out " << i << " is dynamic"; + auto *desc = aclmdlGetDatasetTensorDesc(output_, i); + REQUIRES_NOT_NULL(desc); + size_t real_size = aclGetTensorDescSize(desc); + LOG(INFO) << " get model output size is " << real_size; + TensorShape tf_shape; + size_t shape_size = 
aclGetTensorDescNumDims(desc); + LOG(INFO) << "get model output size is " << real_size << ", shape size is " << shape_size << " dt is " << tf_type; + int64_t cur_dim = 0; + for (size_t j = 0U; j < shape_size; ++j) { + REQUIRES_ACL_STATUS_OK(aclGetTensorDescDimV2(desc, i, &cur_dim), aclGetTensorDescDimV2); + tf_shape.AddDim(cur_dim); + } + Tensor tensor = Tensor(tf_type, tf_shape); + auto tensor_data = const_cast(tensor.tensor_data().data()); + auto tensor_size = tensor.tensor_data().size(); + if (cur_size < tensor_size) { + LOG(ERROR) << " cur_size is " << cur_size << " tensor_size is " << tensor_size; + return errors::Internal("output size is not match"); + } + REQUIRES_ACL_STATUS_OK( + aclrtMemcpy(tensor_data, tensor_size, dev_ptr, tensor_size, ACL_MEMCPY_DEVICE_TO_HOST), aclrtMemcpy); + outputs.emplace_back(std::move(tensor)); + } + } + return Status::OK(); +} + +Status ModelProcess::MappingTfDtToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type) { + const static std::map type_mapping = { + {DT_FLOAT, ACL_FLOAT}, {DT_HALF, ACL_FLOAT16}, {DT_INT8, ACL_INT8}, {DT_INT32, ACL_INT32}, + {DT_UINT8, ACL_UINT8}, {DT_INT16, ACL_INT16}, {DT_UINT16, ACL_UINT16}, {DT_UINT32, ACL_UINT32}, + {DT_INT64, ACL_INT64}, {DT_UINT64, ACL_UINT64}, {DT_DOUBLE, ACL_DOUBLE}, {DT_BOOL, ACL_BOOL}, + {DT_STRING, ACL_STRING}}; + auto found = type_mapping.find(tf_type); + if (found == type_mapping.end()) { + return errors::Internal("Unsupported tf data type", DataTypeString(tf_type), " by acl."); + } + acl_type = found->second; + return Status::OK(); +} + +Status ModelProcess::MappingAclDtToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type) { + const static std::map type_mapping = { + {ACL_FLOAT, DT_FLOAT}, {ACL_FLOAT16, DT_HALF}, {ACL_INT8, DT_INT8}, {ACL_INT32, DT_INT32}, + {ACL_UINT8, DT_UINT8}, {ACL_INT16, DT_INT16}, {ACL_UINT16, DT_UINT16}, {ACL_UINT32, DT_UINT32}, + {ACL_INT64, DT_INT64}, {ACL_UINT64, DT_UINT64}, {ACL_DOUBLE, DT_DOUBLE}, {ACL_BOOL, DT_BOOL}, + 
{ACL_STRING, DT_STRING}};
+  auto found = type_mapping.find(acl_type);
+  if (found == type_mapping.end()) {
+    return errors::Internal("Acl channel receive unsupported data type", acl_type);
+  }
+  tf_type = found->second;
+  return Status::OK();
+}
+
+void ModelProcess::UnloadModel() {  // Unloads the model and destroys its description; does nothing unless load_flag_ is set.
+  if (!load_flag_) {
+    return;
+  }
+  (void)aclmdlUnload(model_id_);  // Best-effort teardown: the ACL return code is deliberately discarded.
+
+  if (model_desc_ != nullptr) {
+    (void)aclmdlDestroyDesc(model_desc_);
+    model_desc_ = nullptr;
+  }
+  load_flag_ = false;
+}
+
+void ModelProcess::DestroyInput() {  // Frees every buffer held by the input dataset, then the dataset itself.
+  if (input_ == nullptr) {
+    return;
+  }
+
+  for (size_t i = 0U; i < aclmdlGetDatasetNumBuffers(input_); ++i) {
+    aclDataBuffer *dataBuffer = aclmdlGetDatasetBuffer(input_, i);
+    void *data = aclGetDataBufferAddr(dataBuffer);
+    (void)aclrtFree(data);  // Release the memory the buffer points at before destroying the buffer object.
+    (void)aclDestroyDataBuffer(dataBuffer);
+  }
+  (void)aclmdlDestroyDataset(input_);
+  input_ = nullptr;  // Makes a repeated call a no-op.
+}
+
+void ModelProcess::DestroyOutput() {  // Frees every buffer held by the output dataset, then the dataset itself.
+  if (output_ == nullptr) {
+    return;
+  }
+  for (size_t i = 0U; i < aclmdlGetDatasetNumBuffers(output_); ++i) {
+    aclDataBuffer* dataBuffer = aclmdlGetDatasetBuffer(output_, i);
+    void* data = aclGetDataBufferAddr(dataBuffer);
+    (void)aclrtFree(data);  // Release the memory the buffer points at before destroying the buffer object.
+    (void)aclDestroyDataBuffer(dataBuffer);
+  }
+  (void)aclmdlDestroyDataset(output_);
+  output_ = nullptr;  // Makes a repeated call a no-op.
+}
+
+OmExecutor::OmExecutor(const std::string &om_path) : om_path_(om_path) {}
 
 Status OmExecutor::Create(const std::string &om_path, std::unique_ptr<OmExecutor> &executor) {
-  // TODO: OM 执行器创建逻辑
   executor.reset(new (std::nothrow) OmExecutor(om_path));
   if (executor == nullptr) {
     return errors::Internal("Failed create executor for om ", om_path);
@@ -29,7 +300,27 @@ Status OmExecutor::Create(const std::string &om_path, std::unique_ptr<OmExecutor
 }
 
 Status OmExecutor::Execute(const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs) {
-  // TODO: OM执行逻辑,in为TF的Host tensor,outputs为空vector
+  // Create and prepare the model process lazily: on the first call, and again after a failed run discarded it.
+  if (!is_prepared) {
+    model_process_ = std::unique_ptr<ModelProcess>(new (std::nothrow) ModelProcess);
+    REQUIRES_NOT_NULL(model_process_);
+    auto status = model_process_->PrepareProcess(om_path_.c_str());
+    if (!status.ok()) {
+      // reset() destroys the half-prepared process; release() would only drop ownership and leak it.
+      model_process_.reset();
+      return status;
+    }
+    is_prepared = true;
+  }
+  REQUIRES_NOT_NULL(model_process_);
+  // On every call: feed the inputs, run inference and collect the outputs.
+  auto status = model_process_->Execute(inputs, outputs);
+  if (!status.ok()) {
+    // Destroy the failed process and clear the flag so the next call prepares a fresh one.
+    model_process_.reset();
+    is_prepared = false;
+    return status;
+  }
   return Status::OK();
 }
-} // namespace tensorflow
+} // namespace tensorflow
diff --git a/tf_adapter/kernels/om_executor.h b/tf_adapter/kernels/om_executor.h
index 1fc8f9c97..7a823d71e 100644
--- a/tf_adapter/kernels/om_executor.h
+++ b/tf_adapter/kernels/om_executor.h
@@ -17,24 +17,70 @@
 #ifndef TENSORFLOW_KERNELS_OM_EXECUTOR_H_
 #define TENSORFLOW_KERNELS_OM_EXECUTOR_H_
 #include "tensorflow/core/framework/op_kernel.h"
+#include "acl/acl.h"
 
 namespace tensorflow {
+class ModelProcess {  // Wraps one ACL model handle together with its description and input/output datasets.
+public:
+  ModelProcess() = default;
+
+  ~ModelProcess();
+
+  Status PrepareProcess(const char *path);
+
+  bool isDynamic(const aclmdlIODims &dims);
+
+  Status LoadModelFromFile(const char *path);
+
+  Status CreateInput();
+
+  Status CreateOutput();
+
+  Status Execute(const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs);
+
+  Status FeedInput(const std::vector<Tensor> &inputs);
+
+  Status ProcessOutput(std::vector<Tensor> &outputs);
+
+private:
+  Status MappingAclDtToTf(const aclDataType &acl_type, tensorflow::DataType &tf_type);
+
+  Status MappingTfDtToAcl(const tensorflow::DataType tf_type, aclDataType &acl_type);
+
+  void UnloadModel();  // No-op unless load_flag_ is set.
+
+  void DestroyInput();  // No-op when input_ is null.
+
+  void DestroyOutput();  // No-op when output_ is null.
+
+private:
+  uint32_t model_id_ = UINT32_MAX;
+  uint32_t device_id_ = 0;
+  aclmdlDesc *model_desc_ = nullptr;  // Destroyed and nulled by UnloadModel().
+  aclmdlDataset *input_ = nullptr;  // Destroyed and nulled by DestroyInput().
+  aclmdlDataset *output_ = nullptr;  // Destroyed and nulled by DestroyOutput().
+  bool load_flag_ = false;  // Cleared by UnloadModel().
+  std::vector<bool> is_input_dynamic;  // NOTE(review): element type was lost in extraction; bool assumed from isDynamic() - confirm.
+  std::vector<bool> is_output_dynamic;  // NOTE(review): element type was lost in extraction; bool assumed from isDynamic() - confirm.
+};
+
 class OmExecutor {
- public:
-  ///
+public:
   /// \param om_path Absolute file path of the om file
   /// \param executor Created om executor
   /// \return Status::OK() or error status if any error occurs
   static Status Create(const std::string &om_path, std::unique_ptr<OmExecutor> &executor);
 
-  ///
   /// \param inputs Tensorflow host input tensors
   /// \param outputs Empty output tensors to be filling
   /// \return Status::OK() or error status if any error occurs
   Status Execute(const std::vector<Tensor> &inputs, std::vector<Tensor> &outputs);
 
- private:
+private:
   explicit OmExecutor(const std::string &om_path);
+  std::string om_path_;  // Path handed to ModelProcess::PrepareProcess() by Execute().
+  std::unique_ptr<ModelProcess> model_process_;  // Created lazily by Execute(); destroyed again when preparing or running fails.
+  bool is_prepared = false;  // True while model_process_ holds a successfully prepared model.
 };
 } // namespace tensorflow
 #endif // TENSORFLOW_KERNELS_OM_EXECUTOR_H_
diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc
index 197aa4c33..4096379c1 100644
--- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc
+++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.cc
@@ -611,3 +611,50 @@ aclError aclrtGetDeviceSatMode(aclrtFloatOverflowMode *mode) {
   *mode = static_cast<aclrtFloatOverflowMode>(deviceSatMode);
   return ACL_SUCCESS;
 }
+aclError aclmdlUnload(uint32_t modelId) {
+  return ACL_SUCCESS;
+}
+
+size_t aclGetDataBufferSizeV2(const aclDataBuffer *dataBuffer) {
+  return 0;
+}
+
+aclDataType aclmdlGetOutputDataType(const aclmdlDesc *modelDesc, size_t index) {
+  return ACL_FLOAT;
+}
+
+aclError aclmdlGetOutputDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims) {
+  return ACL_SUCCESS;
+}
+
+aclError aclGetTensorDescDimV2(const aclTensorDesc *desc, size_t index, int64_t *dimSize) {
+  return ACL_SUCCESS;
+}
+
+size_t aclmdlGetNumInputs(aclmdlDesc *modelDesc) {
+  return 0;
+}
+
+aclError aclSetTensorShape(aclTensorDesc *desc, int numDims, const int64_t *dims) {
+  return ACL_SUCCESS;
+}
+
+size_t aclmdlGetOutputSizeByIndex(aclmdlDesc *modelDesc, size_t index) {
+  return 0;
+}
+
+size_t aclmdlGetInputSizeByIndex(aclmdlDesc *modelDesc, size_t index) {
+  return 0;
+}
+
+aclError aclInit(const char *configPath) {
+  return ACL_SUCCESS;
+}
+
+aclError aclmdlLoadFromFile(const char *modelPath, uint32_t *modelId) {
+  return ACL_SUCCESS;
+}
+
+aclError aclmdlGetInputDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims) {
+  return ACL_SUCCESS;
+}
diff --git a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h
index e46c6e325..625a73831 100644
--- a/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h
+++ b/tf_adapter/tests/depends/ascendcl/src/ascendcl_stub.h
@@ -53,7 +53,7 @@ struct acltdtDataItem {
 };
 
 struct acltdtDataset {
-  acltdtDataset() : freeSelf(false) {};
+  acltdtDataset() : freeSelf(false) {};
 
   ~acltdtDataset() {
     if (freeSelf) {
--
Gitee