From 70e0b1f7364294603df98c32fa0ced5c7dbafa89 Mon Sep 17 00:00:00 2001 From: guopeian Date: Fri, 5 Jul 2024 15:14:05 +0800 Subject: [PATCH] fix err msg --- tf_adapter/kernels/geop_npu.cc | 41 ++++++------ tf_adapter/kernels/npu_sys_ctl_ops.cc | 4 -- .../depends/ge_runner/src/ge_runner_stub.cc | 1 + .../ut/kernels/testcase/geop_npu_test.cc | 1 + tf_adapter/util/ge_plugin.cc | 63 ++++++++++--------- 5 files changed, 54 insertions(+), 56 deletions(-) diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index 8e5b40376..a93507251 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -192,10 +192,6 @@ class NpuHostGetNextAllocator : public tensorflow::Allocator, public tensorflow: std::unique_ptr output_; }; -inline string ToString(ge::Status status) { - return ::ge::StatusFactory::Instance()->GetErrDesc(status); -} - Status BuildStringOutput(geDataUniquePtr data_ptr, size_t output_size, Tensor &cpu_tensor) { TensorShape out_shape = cpu_tensor.shape(); if ((out_shape.num_elements() * sizeof(ge::StringHead)) >= output_size) { @@ -425,9 +421,12 @@ void GeOp::Initialize(OpKernelConstruction *ctx) { << ", jit_compile: " << jit_compile_ << ", is_dynamic_input: " << is_dynamic_input_ << ", getnext_inputs_shape_range: " << getnext_inputs_shape_range_ - << ", data_inputs_shape_range: " << data_inputs_shape_range_ << ", is_train_graph: " << is_train_graph_ - << ", is_dynamic_getnext: " << is_dynamic_getnext_ << ", placeholder_index: " << placeholder_index_ - << ", is_var_init_graph: " << is_var_init_graph_ << ", use_counter_filter: " << use_counter_filter_ + << ", data_inputs_shape_range: " << data_inputs_shape_range_ + << ", is_train_graph: " << is_train_graph_ + << ", is_dynamic_getnext: " << is_dynamic_getnext_ + << ", placeholder_index: " << placeholder_index_ + << ", is_var_init_graph: " << is_var_init_graph_ + << ", use_counter_filter: " << use_counter_filter_ << ", max_key_num: " << max_key_num_ << ", embedding_dim: " << embedding_dim_ << ", padding_key: " << padding_key_ << ", embedding_flags: " << embedding_flags_ << ", compile_dynamic_mode: " << compile_dynamic_mode_; @@ -889,11 +888,7 @@ void GeOp::GetExecGraphId(uint32_t &cache_graph_id, const std::vectorRemoveGraph(erased_graph_id); - if (status != ge::SUCCESS) { - ADP_LOG(WARNING) << "[GEOP] GE Remove Graph failed, ret : " << ToString(status); - LOG(WARNING) << "[GEOP] GE Remove Graph failed, ret : " << ToString(status); - } + (void)ge_session_->RemoveGraph(erased_graph_id); cache_graph_id = erased_graph_id; } else { cache_graph_id = graph_id_ + num; @@ -978,10 +973,10 @@ Status GeOp::CreateGeSession() { // create ge session should be ensure after getinit aysnc success const auto init_status = GePlugin::GetInstance()->GetInitStatus(); if (init_status != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] Init ge failed, ret : " << ToString(init_status); + ADP_LOG(ERROR) << "[GePlugin] Init ge failed"; const auto &error_message = GePlugin::GetInstance()->GetInitErrorMessage(); std::stringstream ss; - ss << "[GePlugin] Initialize ge failed, ret : " << ToString(init_status) << std::endl + ss << "[GePlugin] Initialize ge failed" << std::endl << "Error Message is : " << std::endl << error_message; LOG(ERROR) << ss.str(); @@ -1210,16 +1205,16 @@ Status GeOp::CompileGraph(OpKernelContext *ctx, const std::vector &input Tensor initialized_tensor(ctx->expected_output_dtype(0), TensorShape({0})); ctx->set_output(0, initialized_tensor); ADP_LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, compute_graph is initialize, kernel_name:" - << ctx->op_kernel().name() << ", ret_status:" << ToString(ge::SUCCESS) - << " , tf session: " << tf_session_ << " ,graph id: " << graph_id; + << ctx->op_kernel().name() << ", tf session: " + << tf_session_ << ", graph id: " << graph_id; return Status::OK(); } // 空图直接返回 if (graph_handler_.graph->GetAllNodesSize() == 0UL) { ADP_LOG(INFO) << "[GEOP] End GeOp::ComputeAsync, compute_graph is empty, kernel_name:" << ctx->op_kernel().name() - << ", ret_status:" << ToString(ge::SUCCESS) << " , tf session: " << tf_session_ - << " ,graph id: " << graph_id; + << ", tf session: " << tf_session_ + << ", graph id: " << graph_id; is_empty_graph_ = true; return Status::OK(); } @@ -1269,7 +1264,7 @@ Status GeOp::CompileAndRunGraph(OpKernelContext *ctx, if ((graph_handler_.status != Init) && (!is_empty_graph_) && (shape_changed || IsGraphNeedRebuild(cache_graph_id))) { ADP_LOG(INFO) << "[GEOP] The graph need rebuild, graph id " - << cache_graph_id << " , need_change_precision_mode: " + << cache_graph_id << ", need_change_precision_mode: " << need_recover_precision_mode_; // 让进入需要Compiling状态时,其他线程需要等待此线程重编模型结束 graph_handler_.status = Compiling; @@ -1343,7 +1338,7 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { } } if (is_aoe_) { - ADP_LOG(INFO) << "[GEOP] in tuning func, aoe_mode:" << init_options_["ge.jobType"] + ADP_LOG(INFO) << "[GEOP] In tuning func, aoe_mode:" << init_options_["ge.jobType"] << ", work_path:" << init_options_["ge.tuningPath"] << ", distribute_config:" << init_options_["distribute_config"]; // aoe ini @@ -1432,7 +1427,7 @@ void GeOp::ChangeChannelNameAttr(NodeDef &node_def) const { channel_name.set_s(std::to_string( std::hash{}(tf_session_ + pre_channel_name + "_device_" + std::to_string(device_id)))); (*node_def.mutable_attr())["channel_name"] = channel_name; - ADP_LOG(INFO) << "[GEOP] changed the value of channel_name attr of node: " << node_def.name() << " to " + ADP_LOG(INFO) << "[GEOP] Changed the value of channel_name attr of node: " << node_def.name() << " to " << channel_name.s(); } @@ -1844,9 +1839,9 @@ Status GeOp::ParseOnnxGraphOpAttr(Node *&node) const { parser_params.insert({ge::AscendString(ge::ir_option::OUTPUT), ge::AscendString(subgrph_name.c_str())}); ge::Status status = ge::aclgrphParseONNX(model_path.c_str(), parser_params, sub_graph); if (status != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GEOP] node: " << node->name() << ": Onnx model parse failed, ret: " << ToString(status); + ADP_LOG(ERROR) << "[GEOP] node: " << node->name() << ": Onnx model parse failed"; std::stringstream ss; - ss << "[GEOP] node: " << node->name() << ": Onnx model parse failed, ret: " << ToString(status) << std::endl + ss << "[GEOP] node: " << node->name() << ": Onnx model parse failed" << std::endl << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return errors::Internal(ss.str()); } diff --git a/tf_adapter/kernels/npu_sys_ctl_ops.cc b/tf_adapter/kernels/npu_sys_ctl_ops.cc index 759b6ffaa..de4ee424c 100644 --- a/tf_adapter/kernels/npu_sys_ctl_ops.cc +++ b/tf_adapter/kernels/npu_sys_ctl_ops.cc @@ -34,10 +34,6 @@ #include "tf_adapter/util/npu_attrs.h" namespace tensorflow { -inline string ToString(ge::Status status) { - return ::ge::StatusFactory::Instance()->GetErrDesc(status); -} - static const int64 kSecondToMillis = 1000000; static int64 GetCurrentTimestamp() { diff --git a/tf_adapter/tests/depends/ge_runner/src/ge_runner_stub.cc b/tf_adapter/tests/depends/ge_runner/src/ge_runner_stub.cc index adfdbe3d8..354754337 100644 --- a/tf_adapter/tests/depends/ge_runner/src/ge_runner_stub.cc +++ b/tf_adapter/tests/depends/ge_runner/src/ge_runner_stub.cc @@ -236,6 +236,7 @@ Status ParserFinalize() { if (!is_parser_init) { return ge::FAILED; } + is_parser_init = false; return ge::SUCCESS; } diff --git a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc index e856fb82e..0d9949bf2 100644 --- a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc +++ b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc @@ -203,6 +203,7 @@ Status GeOpRunGraphAsyncMultiStep(std::string example_path, std::vectorGetErrDesc(status); -} void GeFinalize() { // 先等待可能的异步初始化结束 (void) GePlugin::GetInstance()->GetInitStatus(); // ge finalize ge::Status status = ge::GEFinalize(); if (status != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] GE finalize failed, ret : " << ToString(status); - LOG(ERROR) << "[GePlugin] GE finalize failed, ret : " << ToString(status) << std::endl + ADP_LOG(ERROR) << "[GePlugin] GE finalize failed"; + LOG(ERROR) << "[GePlugin] GE finalize failed" << std::endl << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); } // parser finalize ge::Status status_parser = ge::ParserFinalize(); if (status_parser != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] Parser finalize failed, ret : " << ToString(status_parser); - LOG(ERROR) << "[GePlugin] Parser finalize failed, ret : " << ToString(status_parser); + ADP_LOG(ERROR) << "[GePlugin] Parser finalize failed"; + LOG(ERROR) << "[GePlugin] Parser finalize failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); } } @@ -312,9 +310,9 @@ void GePlugin::Init(std::map &init_options, const bool auto const init_options_ascend_string = ChangeStringToAscendString(init_options); ge::Status status_parser = ge::ParserInitialize(init_options_ascend_string); if (status_parser != ge::SUCCESS) { - std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); - ADP_LOG(FATAL) << "[GePlugin] Initialize parser failed, ret : " << ToString(status_parser); - LOG(FATAL) << "[GePlugin] Initialize parser failed, ret : " << ToString(status_parser); + ADP_LOG(ERROR) << "[GePlugin] Initialize parser failed"; + LOG(ERROR) << "[GePlugin] Initialize parser failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); } ADP_LOG(INFO) << "[GePlugin] Initialize parser success."; auto ret = aclrtSetDevice(static_cast(device_id_)); @@ -335,10 +333,9 @@ void GePlugin::Init(std::map &init_options, const bool } else { ge::Status status = ge::GEInitialize(init_options_ascend_string); if (status != ge::SUCCESS) { - std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); - ADP_LOG(FATAL) << "[GePlugin] Initialize ge failed, ret : " << ToString(status); + ADP_LOG(ERROR) << "[GePlugin] Initialize ge failed"; error_message_ = std::string(ge::GEGetErrorMsgV2().GetString()); - LOG(FATAL) << "[GePlugin] Initialize ge failed, ret : " << ToString(status) << std::endl + LOG(ERROR) << "[GePlugin] Initialize ge failed" << std::endl << "Error Message is : " << std::endl << error_message_; } @@ -527,8 +524,9 @@ void NpuClose() { int32_t InitRdmaPool(size_t size) { ge::Status ret = ge::InitRdmaPool(size); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] init rdma pool failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] init rdma pool failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] init rdma pool failed"; + LOG(ERROR) << "[GePlugin] init rdma pool failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] init rdma pool success."; @@ -538,8 +536,9 @@ int32_t InitRdmaPool(size_t size) { int32_t RegistRdmaRemoteAddr(const std::vector &var_info) { ge::Status ret = ge::RdmaRemoteRegister(var_info); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] rdma remote register failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] rdma remote register failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] rdma remote register failed"; + LOG(ERROR) << "[GePlugin] rdma remote register failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] rdma remote register success."; @@ -549,15 +548,17 @@ int32_t RegistRdmaRemoteAddr(const std::vector &var_info) { int32_t RdmaInitAndRegister(const std::vector &var_info, size_t size) { ge::Status ret = ge::InitRdmaPool(size); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] init rdma pool failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] init rdma pool failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] init rdma pool failed"; + LOG(ERROR) << "[GePlugin] init rdma pool failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] init rdma pool success."; ret = ge::RdmaRemoteRegister(var_info); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] rdma remote register failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] rdma remote register failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] rdma remote register failed"; + LOG(ERROR) << "[GePlugin] rdma remote register failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] rdma remote register success."; @@ -567,8 +568,9 @@ int32_t RdmaInitAndRegister(const std::vector &var_info, size_t int32_t GetVarAddrAndSize(const string &var_name, uint64_t &base_addr, uint64_t &var_size) { ge::Status ret = ge::GetVarBaseAddrAndSize(var_name.c_str(), base_addr, var_size); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] get " << var_name << " base addr and size failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] get " << var_name << " base addr and size failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] get " << var_name << " base addr and size failed"; + LOG(ERROR) << "[GePlugin] get " << var_name << " base addr and size failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] get " << var_name << " base addr and size success."; @@ -578,8 +580,9 @@ int32_t GetVarAddrAndSize(const string &var_name, uint64_t &base_addr, uint64_t int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, uint64_t &memory_size) { ge::Status ret = ge::MallocSharedMemory(tensor_info, dev_addr, memory_size); if (ret != ge::SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] malloc shared memory failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] malloc shared memory failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] malloc shared memory failed"; + LOG(ERROR) << "[GePlugin] malloc shared memory failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] malloc shared memory success."; @@ -589,8 +592,9 @@ int32_t MallocSharedMem(const ge::TensorInfo &tensor_info, uint64_t &dev_addr, u int32_t SetDeviceSatMode(uint32_t mode) { aclError ret = aclrtSetDeviceSatMode(aclrtFloatOverflowMode(mode)); if (ret != ACL_SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] set device sat mode failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] set device sat mode failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] set device sat mode failed"; + LOG(ERROR) << "[GePlugin] set device sat mode failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] set device sat mode success."; @@ -601,8 +605,9 @@ int32_t GetDeviceSatMode() { aclrtFloatOverflowMode floatOverflowMode = ACL_RT_OVERFLOW_MODE_UNDEF; aclError ret = aclrtGetDeviceSatMode(&floatOverflowMode); if (ret != ACL_SUCCESS) { - ADP_LOG(ERROR) << "[GePlugin] get device sat mode failed, ret : " << ToString(ret); - LOG(ERROR) << "[GePlugin] get device sat mode failed, ret : " << ToString(ret); + ADP_LOG(ERROR) << "[GePlugin] get device sat mode failed"; + LOG(ERROR) << "[GePlugin] get device sat mode failed" << std::endl + << "Error Message is : " << std::endl << ge::GEGetErrorMsgV2().GetString(); return -1; } ADP_LOG(INFO) << "[GePlugin] get device sat mode success."; -- Gitee