diff --git a/CMakeLists.txt b/CMakeLists.txt index c31967f320e9b3e25bf9373d6943910f624b9bff..6517e92513aa370dd4546db749c7ff768878c008 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ if (ENABLE_OPEN_SRC) include(${CMAKE_CURRENT_LIST_DIR}/cmake/tensorflow.cmake) include_directories(${CMAKE_CURRENT_LIST_DIR}) include_directories(${CMAKE_CURRENT_LIST_DIR}/inc) + include_directories(${CMAKE_CURRENT_LIST_DIR}/inc/toolchain) include_directories(${CMAKE_CURRENT_LIST_DIR}/inc/external) include_directories(${CMAKE_CURRENT_LIST_DIR}/inc/soft_dp) include_directories(${CMAKE_CURRENT_LIST_DIR}/inc/graphengine/inc) @@ -129,17 +130,29 @@ else() ${CMAKE_CURRENT_LIST_DIR}/tf_adapter/optimizers/*.cc ${CMAKE_CURRENT_LIST_DIR}/tf_adapter/util/*.cc ) + + add_custom_target(tensorflow_source ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tensorflow_source.timestamp) + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tensorflow_source.timestamp + COMMAND echo "cp tensorflow1.15 source begin:" + COMMAND rm -rf ${BASE_DIR}/tensorflow_15 && mkdir -p ${BASE_DIR}/tensorflow_15 + COMMAND cp -rfL ${TOP_DIR}/open_source/tensorflow ${BASE_DIR}/tensorflow_15 || echo skip + COMMAND cd ${BASE_DIR}/tensorflow_15/tensorflow && git checkout . && git fetch --all --tags && git checkout tags/v1.15.5 + COMMAND echo "end cp tensorflow1.15 source" + DEPENDS ${TOP_DIR}/open_source/tensorflow) + add_library(tf_adapter SHARED ${SOURCES} ${BASE_DIR}/tf_adapter/util/ge_plugin_wrap.cxx ) + add_dependencies(tf_adapter tensorflow_source) + target_include_directories(tf_adapter PRIVATE ${BASE_DIR}/ ${TOP_DIR}/inc/ ${TOP_DIR}/inc/external/ ${TOP_DIR}/inc/common/ - ${TOP_DIR}/inc/soft_dp/ ${TOP_DIR}/soft_dp/ ${TOP_DIR}/ace/execfwk/soft_dp/ ${TOP_DIR}/graphengine/inc/ @@ -150,8 +163,9 @@ else() ${TOP_DIR}/abl/libc_sec/include/ ${TOP_DIR}/third_party/json/include/ ${TOP_DIR}/open_source/json/include/ - ${TOP_DIR}/third_party/tensorflow/tensorflow-1.15.0/ - ${TOP_DIR}/third_party/tensorflow/compile_deps/tf-1.15.0/include/ + ${BASE_DIR}/tensorflow_15/tensorflow + /opt/buildtools/tensorflow-1.15.5/tensorflow_core/include/ + /opt/buildtools/tensorflow-1.15.5/tensorflow-1.15.5.data/purelib/tensorflow_core/include/ ${HI_PYTHON_INC}/ ) @@ -169,6 +183,11 @@ else() target_link_libraries(tf_adapter PUBLIC $ + $ + $ + $ + $ + $ -Wl,--no-as-needed c_sec ge_runner @@ -221,6 +240,7 @@ else() && rm -rf ${BASE_DIR}/libpywrap_tensorflow_internal.so && rm -rf ${BASE_DIR}/libtensorflow_framework.so.1 && rm -rf ${BASE_DIR}/libtensorflow_framework.so + && rm -rf ${BASE_DIR}/tensorflow_15/ && echo "package whl end" ) ###################################### Tensorflow 2.x ########################################### diff --git a/convert_tf2npu/ast_impl.py b/convert_tf2npu/ast_impl.py index ee5d2d191933f4a7090016ba433eb1fd126b8c2a..de87b24459320792cee4df1fd8a1af328ecd0465 100644 --- a/convert_tf2npu/ast_impl.py +++ b/convert_tf2npu/ast_impl.py @@ -32,16 +32,26 @@ def import_from(node): if "keras" in values: util_global.set_value('is_keras_net', True) if "horovod" in values: + log_msg(getattr(node, "lineno", "None"), "remove horovod import line to None") util_global.set_value('has_hccl_api', True) + new_node = ast.Expr(value=ast.NameConstant(value=None)) + ast.copy_location(new_node, node) + util_global.set_value('need_conver', True) + return new_node for value in node.names: if isinstance(value, ast.alias): values = value.name.split(".") if "keras" in values: util_global.set_value('is_keras_net', True) if "horovod" in values: + log_msg(getattr(node, "lineno", "None"), "remove horovod import line to None") util_global.set_value('has_hccl_api', True) + new_node = ast.Expr(value=ast.NameConstant(value=None)) + ast.copy_location(new_node, node) + util_global.set_value('need_conver', True) + return new_node util_global.set_value('need_conver', True) - + return node def ast_import(node): for value in node.names: @@ -50,8 +60,14 @@ def ast_import(node): if "keras" in values: util_global.set_value('is_keras_net', True) if "horovod" in values: + log_msg(getattr(node, "lineno", "None"), "remove horovod import line to None") util_global.set_value('has_hccl_api', True) - util_global.set_value('need_conver', True) + new_node = ast.Expr(value=ast.NameConstant(value=None)) + ast.copy_location(new_node, node) + util_global.set_value('need_conver', True) + return new_node + util_global.set_value('need_conver', True) + return node def ast_function_def(node): log_success_report(getattr(node, "lineno", "None"), node.name) @@ -81,19 +97,18 @@ def ast_if(node): args=[], keywords=[])) node.body = [keras_sess_assign] + node.body + [ast.Expr(value=close_sess_call)] util_global.set_value('need_conver', True) - if util_global.get_value("has_hccl_api", False): - log_msg(getattr(node, "lineno", "None"), " add npu resource init api") - close_sess_call = ast.Call(func=ast.Name(id="close_session", ctx=ast.Load()), - args=[ast.Name(id="npu_sess", ctx=ast.Load())], keywords=[]) - init_assign = ast.Assign(targets=[ast.Tuple(elts=[ast.Name(id="npu_sess", ctx=ast.Store()), - ast.Name(id="npu_shutdown", ctx=ast.Store())], - ctx=ast.Store())], - value=ast.Call(func=ast.Name(id="init_resource", ctx=ast.Load()), args=[], keywords=[])) - shutdown_call = ast.Call(func=ast.Name(id="shutdown_resource", ctx=ast.Load()), - args=[ast.Name(id="npu_sess", ctx=ast.Load()), ast.Name(id="npu_shutdown", ctx=ast.Load())], - keywords=[]) - node.body = [init_assign] + node.body + [ast.Expr(value=shutdown_call), ast.Expr(value=close_sess_call)] - util_global.set_value('need_conver', True) + log_msg(getattr(node, "lineno", "None"), " add npu resource init api") + close_sess_call = ast.Call(func=ast.Name(id="close_session", ctx=ast.Load()), + args=[ast.Name(id="npu_sess", ctx=ast.Load())], keywords=[]) + init_assign = ast.Assign(targets=[ast.Tuple(elts=[ast.Name(id="npu_sess", ctx=ast.Store()), + ast.Name(id="npu_shutdown", ctx=ast.Store())], + ctx=ast.Store())], + value=ast.Call(func=ast.Name(id="init_resource", ctx=ast.Load()), args=[], keywords=[])) + shutdown_call = ast.Call(func=ast.Name(id="shutdown_resource", ctx=ast.Load()), + args=[ast.Name(id="npu_sess", ctx=ast.Load()), ast.Name(id="npu_shutdown", ctx=ast.Load())], + keywords=[]) + node.body = [init_assign] + node.body + [ast.Expr(value=shutdown_call), ast.Expr(value=close_sess_call)] + util_global.set_value('need_conver', True) return node def convert_loss_scale_api(node): @@ -293,13 +308,20 @@ def ast_call(node): util_global.set_value('need_conver', True) return node if isinstance(node.func, ast.Attribute) and node.func.attr == "DistributedOptimizer": - log_success_report(getattr(node, "lineno", "None"), 'DistributedOptimizer') - return node.args[0] + log_msg(getattr(node, "lineno", "None"), 'change hvd.DistributedOptimizer to the input key optimzier') + opt_keyword = None + for keyword in node.keywords: + if keyword.arg == "optimizer": + opt_keyword = keyword + if opt_keyword is None: + return node.args[0] + else: + return opt_keyword.value if isinstance(node.func, ast.Attribute) and node.func.attr == 'shard': log_success_report(getattr(node, "lineno", "None"), 'shard') - node.args = [ast.Call(func=ast.Name(id='get_rank_size', ctx=ast.Load()), args=[], keywords=[]), - ast.Call(func=ast.Name(id='get_rank_id', ctx=ast.Load()), args=[], keywords=[])] - util_global.set_value("has_hccl_api", True) + node.args = [pasta.parse("int(os.getenv('RANK_SIZE', '1'))"), + pasta.parse("int(os.getenv('RANK_ID', '0'))")] + node.keywords.clear() util_global.set_value('need_conver', True) if isinstance(node.func, ast.Attribute) and node.func.attr == 'dropout': if isinstance(node.func.value, ast.Attribute) and node.func.value.attr == 'nn': @@ -315,6 +337,9 @@ def ast_call(node): for keyword in node.keywords: if keyword.arg != 'rate': keywords_new.append(keyword) + else: + keywords_new.append(ast.keyword(arg='keep_prob', value=ast.BinOp(left=ast.Num(n=1), op=ast.Sub(), + right=keyword.value))) node.keywords = keywords_new util_global.set_value('need_conver', True) if isinstance(node.func, ast.Attribute) and ((node.func.attr == 'map_and_batch') or (node.func.attr == 'batch' \ @@ -348,6 +373,17 @@ def ast_call(node): node.keywords = [] node.args = [] util_global.set_value('need_conver', True) + if (isinstance(node.func, ast.Attribute) and (node.func.attr == 'RunConfig')) and \ + (_call_name_match(node.func.value, 'estimator') or _call_name_match(node.func.value, 'tpu')): + save_summary_steps = None + for keyword in node.keywords: + if (keyword.arg == 'save_summary_steps'): + save_summary_steps = keyword + break + if len(node.args) < 3 and not save_summary_steps: + log_msg(getattr(node, 'lineno'), 'RunConfig() add save_summary_steps=0') + util_global.set_value('need_conver', True) + node.keywords.append(ast.keyword(arg='save_summary_steps', value=pasta.parse('0'))) if isinstance(node.func, ast.Attribute) and (node.func.attr == 'TPUEstimator') and \ ((isinstance(node.func.value, ast.Attribute) and (node.func.value.attr == 'tpu')) or \ (isinstance(node.func.value, ast.Name) and (node.func.value.id == 'tpu'))): @@ -543,38 +579,6 @@ def _call_name_match(call_func, call_name): return (isinstance(call_func, ast.Attribute) and (call_func.attr == call_name)) or \ (isinstance(call_func, ast.Name) and (call_func.id) == call_name) -def remove_hvd_import(r_node): - n = 0 - lenline = len(r_node.body) - - while n < lenline and not isinstance(r_node.body[n], ast.ImportFrom) and not isinstance(r_node.body[n], ast.Import): - n += 1 - - while n < lenline and (isinstance(r_node.body[n], ast.ImportFrom) or isinstance(r_node.body[n], ast.Import)): - if isinstance(r_node.body[n], ast.ImportFrom): - if r_node.body[n].module != None: - values = r_node.body[n].module.split(".") - if "horovod" in values: - log_msg(getattr(r_node.body[n], "lineno", "None"), " remove hvd import.") - r_node.body.pop(n) - lenline -= 1 - for value in r_node.body[n].names: - if isinstance(value, ast.alias): - values = value.name.split(".") - if "horovod" in values: - log_msg(getattr(r_node.body[n], "lineno", "None"), " remove hvd import.") - r_node.body.pop(n) - lenline -= 1 - elif isinstance(r_node.body[n], ast.Import): - for value in r_node.body[n].names: - if isinstance(value, ast.alias): - values = value.name.split(".") - if "horovod" in values: - log_msg(getattr(r_node.body[n], "lineno", "None"), " remove hvd import.") - r_node.body.pop(n) - lenline -= 1 - n += 1 - def insert_npu_import(r_node): npu_alias = ast.alias(name='*', asname=None) npu_import = ast.ImportFrom(module='npu_bridge.npu_init', names=[npu_alias], level=0) diff --git a/convert_tf2npu/conver_by_ast.py b/convert_tf2npu/conver_by_ast.py index b8c2a1ddedad17b4a45896ae188b669b16a4a6df..64f93b95c623db8179afce230dab01459f85f27e 100644 --- a/convert_tf2npu/conver_by_ast.py +++ b/convert_tf2npu/conver_by_ast.py @@ -53,13 +53,13 @@ class ConverByAst(ast.NodeTransformer): return node def visit_ImportFrom(self, node): - import_from(node) self.generic_visit(node) + node = import_from(node) return node def visit_Import(self, node): - ast_import(node) self.generic_visit(node) + node = ast_import(node) return node def visit_Assign(self, node): @@ -101,12 +101,9 @@ def conver_ast(path, out_path_dst, file_name): insert_npu_import(r_node) if not util_global.get_value('has_main_func', False) and (util_global.get_value('has_hccl_api', False) or util_global.get_value('is_keras_net', False)): - log_warning('the network of keras and horovod, or using dataset.shard script do not have main func, ' + log_warning('the network of keras and horovod script do not have main func, ' 'should set -m or --main parameter') - if util_global.get_value('has_main_func', False) and util_global.get_value('has_hccl_api', False): - remove_hvd_import(r_node) - if util_global.get_value('is_main_file', False) and util_global.get_value('has_hccl_api', False): - remove_hvd_import(r_node) + if util_global.get_value('is_main_file', False): insert_npu_resource_init(r_node) insert_npu_resource_shutdown(r_node) if util_global.get_value('is_main_file', False) and util_global.get_value('is_keras_net', False): diff --git a/convert_tf2npu/mappings/ast.py b/convert_tf2npu/mappings/ast.py index f2864da5b7594568b33af0efe278d33f49deff97..1d316029126ae8970811037d49111f84964e878f 100644 --- a/convert_tf2npu/mappings/ast.py +++ b/convert_tf2npu/mappings/ast.py @@ -4,9 +4,9 @@ "dropout": ["npu_ops", "tf.nn.dropout", "npu_ops.dropout"], "init": ["print", "hvd.init", "None"], "DistributedOptimizer": ["NPUDistributedOptimizer", "hvd.DistributedOptimizer", "NPUDistributedOptimizer"], -"rank": ["get_npu_rank_id", "hvd.rank", "get_npu_rank_id"], -"local_rank": ["get_npu_local_rank_id", "hvd.local_rank", "get_npu_local_rank_id"], -"size": ["get_npu_rank_size", "hvd.size", "get_npu_rank_size"], +"rank": ["get_npu_rank_id", "hvd.rank", "get_npu_rank_id"], +"local_rank": ["get_npu_local_rank_id", "hvd.local_rank", "get_npu_local_rank_id"], +"size": ["get_npu_rank_size", "hvd.size", "get_npu_rank_size"], "BroadcastGlobalVariablesHook": ["print", "hvd.BroadcastGlobalVariablesHook", "None"], "shard": ["", "dataset.shard(xxx, xxx)", "dataset.shard(get_rank_size(), get_rank_id())"], "EstimatorSpec": ["NPUEstimatorSpec", "tf.estimator.EstimatorSpec", "NPUEstimatorSpec"], @@ -49,7 +49,7 @@ "DNNLinearCombinedClassifier", "DNNLinearCombinedEstimator", "DNNLinearCombinedRegressor", "LinearClassifier", "LinearEstimator", "LinearRegressor"], "EstimatorFunc": ["train"], -"Session()": ["", "*.Session()", "*.Session(config=npu_session_config_init())"], +"Session()": ["", "*.*Session()", "*.*Session(config=npu_session_config_init())"], "ConfigProto()": ["", "*.ConfigProto()", "npu_config_proto(config_proto=*.ConfigProto())"], "GraphOptions()": ["", "*.GraphOptions()", "npu_graph_options(graph_options=*.GraphOptions())"], "OptimizerOptions()": ["", "*.OptimizerOptions()", "npu_optimizer_options(optimizer_options=*.OptimizerOptions())"], diff --git a/convert_tf2npu/util.py b/convert_tf2npu/util.py index 1034e227a125327583aa4abd29f42950c0f51859..190afe51e9e2c05812cec72e11f6371dd4e105eb 100644 --- a/convert_tf2npu/util.py +++ b/convert_tf2npu/util.py @@ -28,7 +28,7 @@ def log_info(lineno, msg, file): write_conver_report(content, file) def log_warning(msg): - content = "************" + msg + "************" + content = "WARNING:" + msg print(content) write_conver_report(content, util_global.get_value('report_file')[0]) diff --git a/inc/graphengine/inc/external/ge/ge_api_types.h b/inc/graphengine/inc/external/ge/ge_api_types.h index fbd6c020ef57633b6566e43f422f060d81f9c3e2..6f5bbfbfe032fa714ac281d7162c4528342cd08a 100644 --- a/inc/graphengine/inc/external/ge/ge_api_types.h +++ b/inc/graphengine/inc/external/ge/ge_api_types.h @@ -113,6 +113,7 @@ const char *const INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16"; const char *const OP_DEBUG_LEVEL = "ge.opDebugLevel"; const char *const PERFORMANCE_MODE = "ge.performance_mode"; const char *const MODIFY_MIXLIST = "ge.exec.modify_mixlist"; +const char *const OP_PRECISION_MODE = "ge.exec.op_precision_mode"; } // namespace configure_option // Configure stream num by Session constructor options param, // its value should be int32_t type, default value is "1" @@ -326,6 +327,8 @@ const std::string PERFORMANCE_MODE = "ge.performance_mode"; const std::string MODIFY_MIXLIST = "ge.exec.modify_mixlist"; +const std::string OP_PRECISION_MODE = "ge.exec.op_precision_mode"; + // Graph run mode enum GraphRunMode { PREDICTION = 0, TRAIN }; @@ -405,6 +408,7 @@ static const char *const OP_BANK_UPDATE = ge::OP_BANK_UPDATE_FLAG.c_str(); static const char *const OP_DEBUG_LEVEL = ge::OP_DEBUG_LEVEL.c_str(); static const char *const PERFORMANCE_MODE = ge::PERFORMANCE_MODE.c_str(); static const char *const MODIFY_MIXLIST = ge::MODIFY_MIXLIST.c_str(); +static const char *const OP_PRECISION_MODE = ge::OP_PRECISION_MODE.c_str(); // for interface: aclgrphBuildModel #ifdef __GNUC__ @@ -416,6 +420,7 @@ const std::set ir_builder_suppported_options = {INPUT_FORMAT, DYNAMIC_IMAGE_SIZE, DYNAMIC_DIMS, INSERT_OP_FILE, + OP_PRECISION_MODE, PRECISION_MODE, TUNE_DEVICE_IDS, EXEC_DISABLE_REUSED_MEMORY, diff --git a/tf_adapter/interface_spec/api_npu_config.pyh b/tf_adapter/interface_spec/api_npu_config.pyh index 614a5a858ef9421f0e883e15378220c1ce0622f0..1674c35084affdb6207d41033b82b9c3f1e60ed0 100644 --- a/tf_adapter/interface_spec/api_npu_config.pyh +++ b/tf_adapter/interface_spec/api_npu_config.pyh @@ -15,7 +15,7 @@ class NPURunConfig(run_config_lib.RunConfig): op_compiler_cache_mode=None, op_compiler_cache_dir=None, debug_dir=None, hcom_multi_mode=False, dynamic_input=False, dynamic_graph_execute_mode="dynamic_execute", dynamic_inputs_shape_range=None, train_distribute=None, eval_distribute=None, local_rank_id=None, local_device_list=None, session_device_id=None, - distribute_config=None, modify_mixlist=None): + distribute_config=None, modify_mixlist=None, op_precision_mode=None): class ProfilingConfig(): def __init__(self, enable_profiling=False, profiling_options=None): diff --git a/tf_adapter/kernels/data_item_deliver.h b/tf_adapter/kernels/data_item_deliver.h index 39fd073e77eb16fefcae2c3c078f6df82295014c..8ebabcfe6fd334328da17e18c47d0f085b744c8b 100644 --- a/tf_adapter/kernels/data_item_deliver.h +++ b/tf_adapter/kernels/data_item_deliver.h @@ -45,8 +45,8 @@ limitations under the License. namespace tensorflow { namespace data { -static constexpr char *SOCKET_SERVER_PATH = "/tmp/server"; -static constexpr char *MESSAGE_HEAD = "head_check"; +static constexpr char const *SOCKET_SERVER_PATH = "/tmp/server"; +static constexpr char const *MESSAGE_HEAD = "head_check"; static constexpr int QLEN = 8; static constexpr int HEAD_INFO_SIZE = 3; static constexpr int ITEM_INFO_SIZE = 9; @@ -72,7 +72,7 @@ class DataItemDeliver { Status SendDataVec(std::vector &data_items, int fd); Status CreateSockAddr(struct sockaddr_un &sockaddr, const char *path, int local_rank_id); - int Recv(void *buffer, size_t data_len); + uint64_t Recv(void *buffer, size_t data_len); template Status GetDataLen(T &value, size_t size); Status GetTensorType(tdt::TdtDataType &data_type); @@ -83,12 +83,12 @@ class DataItemDeliver { mutex client_list_mu_; std::vector client_fd_list_; - int server_fd_; + int server_fd_ = -1; std::shared_ptr pools_; struct sockaddr_un local_addr_ = {0}; int local_rank_id_; - std::vector local_device_list_; uint32_t device_id_; + std::vector local_device_list_; std::string channel_name_; }; @@ -120,7 +120,7 @@ DataItemDeliver::~DataItemDeliver() { Status DataItemDeliver::ParallelInitSocketClient() { std::vector> init_status; - for (int i = 1; i < local_device_list_.size(); i++) { + for (size_t i = 1; i < local_device_list_.size(); i++) { init_status.emplace_back( pools_->Enqueue(&DataItemDeliver::InitSocketClient, this, i)); } @@ -227,7 +227,7 @@ Status DataItemDeliver::InitSocketServer() { Status DataItemDeliver::CheckHead(const char *check_value) { uint32_t head_size = 0; - int recvn = Recv(&head_size, UINT32_SIZE); + uint64_t recvn = Recv(&head_size, UINT32_SIZE); if (recvn != UINT32_SIZE) { ADP_LOG(ERROR) << "Failed to recv head length."; LOG(ERROR) << "Failed to recv head length."; @@ -290,7 +290,7 @@ Status DataItemDeliver::RecvDataVec(std::vector &items) { return Status::OK(); } -int DataItemDeliver::Recv(void *buffer, size_t data_len) { +uint64_t DataItemDeliver::Recv(void *buffer, size_t data_len) { int ret = -1; uint64_t buf_pos = 0; while (data_len > 0) { @@ -304,13 +304,13 @@ int DataItemDeliver::Recv(void *buffer, size_t data_len) { << ", channel_name:" << channel_name_; LOG(WARNING) << "Client connect closed, server_fd:" << server_fd_ << ", channel_name:" << channel_name_; - return ret; + return 0; } else if (ret < 0) { ADP_LOG(ERROR) << "Recv data failed,error:" << strerror(errno) << ", (errno:" << errno << "), server_fd:" << server_fd_; LOG(ERROR) << "Recv data failed,error:" << strerror(errno) << ", (errno:" << errno << "), server_fd:" << server_fd_; - return ret; + return 0; } buf_pos += ret; data_len -= ret; @@ -320,7 +320,7 @@ int DataItemDeliver::Recv(void *buffer, size_t data_len) { template Status DataItemDeliver::GetDataLen(T &value, size_t size) { - int recvn = Recv(&value, size); + uint64_t recvn = Recv(&value, size); if (recvn != size) { return errors::Internal("Failed to recv data length."); } @@ -328,7 +328,7 @@ Status DataItemDeliver::GetDataLen(T &value, size_t size) { } Status DataItemDeliver::GetTensorType(tdt::TdtDataType &data_type) { - int recvn = Recv(&data_type, UINT32_SIZE); + uint64_t recvn = Recv(&data_type, UINT32_SIZE); if (recvn != UINT32_SIZE) { return errors::Internal("Failed to recv data length."); } @@ -354,7 +354,7 @@ Status DataItemDeliver::GetTensorData(uint64_t &data_len, LOG(ERROR) << "Failed to reset buff memory. size:" << data_len; return errors::Internal("Failed to reset buff memory."); } - int recvn = Recv(buff, data_len); + uint64_t recvn = Recv(buff, data_len); if (recvn != data_len) { free(buff); ADP_LOG(ERROR) << "Failed to receive data."; @@ -384,7 +384,7 @@ Status DataItemDeliver::GetTensorString(std::string &str) { LOG(ERROR) << "Failed to reset buff memory."; return errors::Internal("Failed to reset buff memory."); } - int recvn = Recv(buff, size); + uint64_t recvn = Recv(buff, size); if (recvn != size) { free(buff); ADP_LOG(ERROR) << "Failed to receive data."; @@ -421,7 +421,7 @@ Status DataItemDeliver::SendDataVec(std::vector &data_items, uint32_t head_size = (strlen(MESSAGE_HEAD) + 1) * CHAR_SIZE; head_info[0].iov_base = &head_size; head_info[0].iov_len = UINT32_SIZE; - head_info[1].iov_base = MESSAGE_HEAD; + head_info[1].iov_base = const_cast(MESSAGE_HEAD); head_info[1].iov_len = head_size; head_info[2].iov_base = &vector_size; head_info[2].iov_len = UINT32_SIZE; @@ -469,9 +469,9 @@ Status DataItemDeliver::CreateSockAddr(struct sockaddr_un &sock_addr, const char *path, int device_id) { sock_addr.sun_family = AF_UNIX; int len = 0; - if (-1 == - (len = snprintf(sock_addr.sun_path, sizeof(sock_addr.sun_path), "%s%s%d", - path, channel_name_.c_str(), device_id))) { + if (-1 == (len = snprintf_s(sock_addr.sun_path, sizeof(sock_addr.sun_path), + sizeof(sock_addr.sun_path) - 1, "%s%s%d", path, + channel_name_.c_str(), device_id))) { ADP_LOG(ERROR) << "Set sun_path failed."; LOG(ERROR) << "Set sun_path failed."; return errors::Internal("Set sun_path failed."); @@ -494,4 +494,4 @@ void DataItemDeliver::SocketSend(struct iovec temp_items[], int vector_size, } } // namespace data } // namespace tensorflow -#endif \ No newline at end of file +#endif diff --git a/tf_adapter/kernels/geop_npu.cc b/tf_adapter/kernels/geop_npu.cc index f53502290ef4f194ea0e6bbe3c6ad57a4d12719d..de8513084199d89ca6acb74efbb0429f0880b2b1 100644 --- a/tf_adapter/kernels/geop_npu.cc +++ b/tf_adapter/kernels/geop_npu.cc @@ -528,7 +528,10 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { bool is_lazy_recompile_mode = dynamic_input_ == "1" && dynamic_graph_execute_mode_ == "lazy_recompile"; if (is_set_dynamic_config && is_tuning) { ADP_LOG(FATAL) << "dynamic input config can not use with mstuning."; - LOG(FATAL) << "dynamic input config can not use with mstuning."; + std::stringstream ss; + ss << "dynamic input config can not use with mstuning."; + OP_REQUIRES_ASYNC(ctx, false, errors::Internal(ss.str()), done); + return; } else if (is_set_dynamic_config && !is_tuning) { if (InitRebuildFlag(cache_graph_id) != 0) { OP_REQUIRES_ASYNC(ctx, false, errors::Internal("Failed to check rebuild flag"), done); @@ -731,7 +734,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { << ", graph id: " << cache_graph_id << std::endl << "Error Message is : " << std::endl << error_message; - LOG(FATAL) << ss.str(); OP_REQUIRES_ASYNC(ctx, status == ge::SUCCESS, errors::Unavailable(ss.str()), done); } else { add_graph_flag_ = true; @@ -781,21 +783,28 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { if (ge_status == ge::SUCCESS) { if (BuildOutputTensorInfo(ctx, outputs) != Status::OK()) { ADP_LOG(FATAL) << ctx->op_kernel().name() << " GEOP::DoRunAsync get output failed."; - LOG(FATAL) << ctx->op_kernel().name() << " GEOP::DoRunAsync get output failed."; + std::string error_message = ge::GEGetErrorMsg(); + std::stringstream ss; + ss << ctx->op_kernel().name() + << "GEOP::DoRunAsync get output failed." << std::endl + << "Error Message is : " << std::endl + << error_message; + OP_REQUIRES_ASYNC(ctx, false, errors::Internal(ss.str()), done); + return; } } else if (ge_status == ge::END_OF_SEQUENCE) { ctx->SetStatus(errors::OutOfRange("End of sequence")); ADP_LOG(WARNING) << "[GEOP] Out of range: End of sequence."; LOG(WARNING) << "[GEOP] Out of range: End of sequence."; } else if (ge_status != ge::SUCCESS) { - tensorflow::Status tfStatus = errors::Unavailable(ToString(ge_status)); - ctx->CtxFailureWithWarning(tfStatus); std::this_thread::sleep_for(std::chrono::milliseconds(kFatalSleepTime)); ADP_LOG(FATAL) << ctx->op_kernel().name() << "GEOP::::DoRunAsync Failed"; std::string error_message = ge::GEGetErrorMsg(); - LOG(FATAL) << ctx->op_kernel().name() << "GEOP::::DoRunAsync Failed" << std::endl - << "Error Message is : " << std::endl - << error_message; + std::stringstream ss; + ss << ctx->op_kernel().name() << "GEOP::::DoRunAsync Failed" << std::endl + << "Error Message is : " << std::endl << error_message; + OP_REQUIRES_ASYNC(ctx, false, errors::Internal(ss.str()), done); + return; } int64 run_end_time = InferShapeUtil::GetCurrentTimestap(); ADP_LOG(INFO) << "[GEOP] RunGraphAsync callback, status:" << ge_status << ", kernel_name:" @@ -818,7 +827,6 @@ void GeOp::ComputeAsync(OpKernelContext *ctx, DoneCallback done) { << ", graph id: " << cache_graph_id << std::endl << "Error Message is : " << std::endl << error_message; - LOG(FATAL) << ss.str(); OP_REQUIRES_ASYNC(ctx, status == ge::SUCCESS, errors::Unavailable(ss.str()), done); } @@ -1069,8 +1077,8 @@ void GeOp::AnalyzeInputDesc(void *tensor_ptr, ge::Tensor &input, ge::DataType ty ge_tensor_desc.SetPlacement(output_info->placement_); input.SetTensorDesc(ge_tensor_desc); - uint8_t* data = output_info->data_.get(); - input.SetData(output_info->data_.get(), output_info->output_size_, output_info->data_.get_deleter()); + uint8_t* data = output_info->data_.release(); + input.SetData(data, output_info->output_size_, output_info->data_.get_deleter()); ADP_LOG(INFO) << "[GEOP] Get input shape:" << input_shape.DebugString() << ", input placement:" << output_info->placement_ << ", input length:" << output_info->output_size_ diff --git a/tf_adapter/kernels/geop_npu.h b/tf_adapter/kernels/geop_npu.h index 730c30c164a76bf5861a606c7ff05649b459770b..e8f8eae70bfaac4b598412195645e172c116e6be 100644 --- a/tf_adapter/kernels/geop_npu.h +++ b/tf_adapter/kernels/geop_npu.h @@ -38,7 +38,7 @@ limitations under the License. #include "ge/ge_api_types.h" #include "graph/tensor.h" #include "graph/utils/graph_utils.h" -#include "toolchain/tuning_tool/tune_api.h" +#include "tuning_tool/tune_api.h" #include namespace tensorflow { diff --git a/tf_adapter/kernels/npu_sys_ctl_ops.cc b/tf_adapter/kernels/npu_sys_ctl_ops.cc index 3fd031fdc4f7f8e5406f3eb0c3cfa8c02306dda6..1d262d68920ed4b30eafb247362cee16774b32ca 100644 --- a/tf_adapter/kernels/npu_sys_ctl_ops.cc +++ b/tf_adapter/kernels/npu_sys_ctl_ops.cc @@ -38,7 +38,6 @@ limitations under the License. #include "framework/omg/parser/parser_api.h" #include "ge/ge_api.h" #include "ge/ge_api_types.h" -#include "hccl/hcom.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/mutex.h" #include "tf_adapter/common/adp_logger.h" diff --git a/tf_adapter/optimizers/control_flow_conversion_pass.cc b/tf_adapter/optimizers/control_flow_conversion_pass.cc index 0f1e33c2a6e047cdb0a2e715b891b85b195b6be6..c6d7c9afe324e6c5151fe4e518d0315b6f8aae9b 100644 --- a/tf_adapter/optimizers/control_flow_conversion_pass.cc +++ b/tf_adapter/optimizers/control_flow_conversion_pass.cc @@ -69,6 +69,7 @@ Status ControlFlowConversionPass::Run(const GraphOptimizationPassOptions &option // Delete _lower_using_switch_merge before LowerFunctionalOpsPass for (int i = 2; i < graph->num_node_ids(); ++i) { Node *n = graph->FindNodeId(i); + if (n == nullptr) { continue; } if (n->IsIfNode() || n->type_string() == "Case" || n->IsWhileNode()) { n->ClearAttr(kLowerUsingSwitchMergeAttr); } } diff --git a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc index e32965a0986a6cdcc0ff6971969551cf130db5f6..634958502fea0e827297a1dfeaf3cc09fc5a8a5c 100644 --- a/tf_adapter/optimizers/om_partition_subgraphs_pass.cc +++ b/tf_adapter/optimizers/om_partition_subgraphs_pass.cc @@ -2085,8 +2085,6 @@ Status OMPartitionSubgraphsPass::ProcessGraph(std::unique_ptr *graph, Fun break; } } - ADP_LOG(INFO) << "pass options:"; - NpuAttrs::LogOptions(pass_options); ADP_LOG(INFO) << "all options:"; NpuAttrs::LogOptions(all_options); diff --git a/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py b/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py index 577d0e1e729b65028fbd24ede5e16a9fe8f83cfc..708277bcb3d4569255bc10a0545a64a16d09a80d 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/keras_to_npu.py @@ -42,6 +42,7 @@ from tensorflow.python.training import training_util from tensorflow.python.training.tracking import graph_view from tensorflow.python.training.tracking import util as trackable_util from tensorflow.python.util import compat_internal +from tensorflow_estimator.python.estimator import run_config from tensorflow_estimator.python.estimator import estimator as estimator_lib from tensorflow_estimator.python.estimator import model_fn as model_fn_lib from tensorflow_estimator.python.estimator.export import export_lib diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py index bab6797c571088c4aa7ca17abfd27eb7e970c2d6..68dd28e12a7ae61f5d88d14e55d3157540410653 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_config.py @@ -87,7 +87,8 @@ class NPURunConfig(run_config_lib.RunConfig): local_device_list=None, session_device_id=None, distribute_config=None, - modify_mixlist=None + modify_mixlist=None, + op_precision_mode=None ): """ Constructs a NPUConfig. @@ -155,6 +156,7 @@ class NPURunConfig(run_config_lib.RunConfig): local_device_list: Available devices. distribute_config: Specify the NCA configuration file path modify_mixlist: Set the path of operator mixed precision configuration file. + op_precision_mode: Set the path of operator precision mode configuration file (.ini) """ # Check iterations_per_loop. @@ -236,6 +238,7 @@ class NPURunConfig(run_config_lib.RunConfig): self._session_device_id = session_device_id self._distribute_config = distribute_config self._modify_mixlist = modify_mixlist + self._op_precision_mode = op_precision_mode super(NPURunConfig, self).__init__( model_dir=model_dir, diff --git a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py index eed5a44e4c08fbfb5fd3fdf4980ac40807b3eecc..2bbf1adc5510bd299da0cac3e2b2fb0a73761a64 100644 --- a/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py +++ b/tf_adapter/python/npu_bridge/estimator/npu/npu_estimator.py @@ -758,6 +758,8 @@ class NPUEstimator(estimator_lib.Estimator): custom_op.parameter_map["session_device_id"].i = config._session_device_id if config._modify_mixlist is not None: custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes(config._modify_mixlist) + if config._op_precision_mode is not None: + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes(config._op_precision_mode) # add profiling options to custom_op self.__load_profiling_options(config, custom_op) diff --git a/tf_adapter/tests/ut/CMakeLists.txt b/tf_adapter/tests/ut/CMakeLists.txt index e6142f3a6743e7d6a589d4d2af0b4cde36ec76a9..b84900524df2acde831b4219285e4fe8e65f6ace 100644 --- a/tf_adapter/tests/ut/CMakeLists.txt +++ b/tf_adapter/tests/ut/CMakeLists.txt @@ -31,9 +31,6 @@ file(GLOB_RECURSE UT_SOURCES add_executable(tfadapter_utest "main.cc" ${UT_SOURCES} - #${TFADAPTER_DIR}/tf_adapter/kernels/geop_npu.cc - #${TFADAPTER_DIR}/tf_adapter/kernels/infeed_outfeed_ops.cc - #${TFADAPTER_DIR}/tf_adapter/kernels/npu_sys_ctl_ops.cc ) target_include_directories(tfadapter_utest PRIVATE @@ -57,7 +54,7 @@ foreach (UT_LINK_FLAG ${UT_LINK_FLAGS}) endforeach (UT_LINK_FLAG) string(STRIP ${PYTHON_LIB_PATH} PYTHON_LIB_PATH) -message("hrz python lib path------------${PYTHON_LIB_PATH}") +message("python lib path ${PYTHON_LIB_PATH}") add_dependencies(tfadapter_utest aoe_tuning) diff --git a/tf_adapter/tests/ut/kernels/pbtxt/geop_dynamic_config.pbtxt b/tf_adapter/tests/ut/kernels/pbtxt/geop_dynamic_config.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..b84ef13c042d1ea6ed3b397ec22a83fe829d0dfc --- /dev/null +++ b/tf_adapter/tests/ut/kernels/pbtxt/geop_dynamic_config.pbtxt @@ -0,0 +1,527 @@ +node { + name: "retval_Add1_0_0" + op: "_Retval" + input: "GeOp61_0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "GeOp61_0" + op: "GeOp" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "Tin" + value { + list { + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "_NpuOptimizer" + value { + s: "NpuOptimizer" + } + } + attr { + key: "_auto_tune_mode" + value { + s: "" + } + } + attr { + key: "_buffer_optimize" + value { + s: "l2_optimize" + } + } + attr { + key: "_compress_weight_conf" + value { + s: "" + } + } + attr { + key: "_debug_dir" + value { + s: "" + } + } + attr { + key: "_distribute_config" + value { + s: "" + } + } + attr { + key: "_do_npu_optimizer" + value { + s: "1" + } + } + attr { + key: "_dump_debug_mode" + value { + s: "all" + } + } + attr { + key: "_dump_mode" + value { + s: "output" + } + } + attr { + key: "_dump_path" + value { + s: "./" + } + } + attr { + key: "_dump_step" + value { + s: "1" + } + } + attr { + key: "_dynamic_dims" + value { + s: "1,128;3,128;5,128" + } + } + attr { + key: "_dynamic_graph_execute_mode" + value { + s: "lazy_recompile" + } + } + attr { + key: "_dynamic_input" + value { + s: "1" + } + } + attr { + key: "_dynamic_node_type" + value { + s: "" + } + } + attr { + key: "_enable_compress_weight" + value { + s: "0" + } + } + attr { + key: "_enable_data_pre_proc" + value { + s: "0" + } + } + attr { + key: "_enable_dump" + value { + s: "1" + } + } + attr { + key: "_enable_dump_debug" + value { + s: "1" + } + } + attr { + key: "_enable_exception_dump" + value { + s: "" + } + } + attr { + key: "_enable_scope_fusion_passes" + value { + s: "" + } + } + attr { + key: "_enable_small_channel" + value { + s: "0" + } + } + attr { + key: "_fusion_switch_file" + value { + s: "" + } + } + attr { + key: "_graph_run_mode" + value { + s: "1" + } + } + attr { + key: "_hcom_multi_mode" + value { + s: "" + } + } + attr { + key: "_hcom_parallel" + value { + s: "0" + } + } + attr { + key: "_in_out_pair" + value { + s: "" + } + } + attr { + key: "_in_out_pair_flag" + value { + s: "1" + } + } + attr { + key: "_input_shape" + value { + s: "getnext:-1,-1" + } + } + attr { + key: "_is_tailing_optimization" + value { + s: "0" + } + } + attr { + key: "_iterations_per_loop" + value { + s: "1" + } + } + attr { + key: "_job" + value { + s: "localhost" + } + } + attr { + key: "_local_device_list" + value { + s: "" + } + } + attr { + key: "_local_rank_id" + value { + s: "-1" + } + } + attr { + key: "_lower_functional_ops" + value { + s: "0" + } + } + attr { + key: "_mix_compile_mode" + value { + s: "0" + } + } + attr { + key: "_mstune_mode" + value { + s: "" + } + } + attr { + key: "_op_compiler_cache_dir" + value { + s: "" + } + } + attr { + key: "_op_compiler_cache_mode" + value { + s: "" + } + } + attr { + key: "_op_debug_level" + value { + s: "0" + } + } + attr { + key: "_op_select_implmode" + value { + s: "" + } + } + attr { + key: "_op_tune_mode" + value { + s: "" + } + } + attr { + key: "_optypelist_for_implmode" + value { + s: "" + } + } + attr { + key: "_precision_mode" + value { + s: "" + } + } + attr { + key: "_profiling_mode" + value { + s: "0" + } + } + attr { + key: "_profiling_options" + value { + s: "" + } + } + attr { + key: "_session_device_id" + value { + s: "" + } + } + attr { + key: "_stream_max_parallel_num" + value { + s: "" + } + } + attr { + key: "_task_index" + value { + s: "0" + } + } + attr { + key: "_use_off_line" + value { + s: "1" + } + } + attr { + key: "_variable_format_optimize" + value { + s: "1" + } + } + attr { + key: "_work_path" + value { + s: "/home/ascend" + } + } + attr { + key: "_aoe_mode" + value { + s: "2" + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } + attr { + key: "function" + value { + func { + name: "GeOp61_0" + } + } + } +} +library { + function { + signature { + name: "GeOp61_0" + output_arg { + name: "Add1_0_retval" + type: DT_FLOAT + } + } + node_def { + name: "Const_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "_NpuOptimizer" + value { + s: "NpuOptimizer" + } + } + attr { + key: "_iterations_per_loop" + value { + s: "1" + } + } + attr { + key: "_job" + value { + s: "localhost" + } + } + attr { + key: "_mix_compile_mode" + value { + s: "0" + } + } + attr { + key: "_task_index" + value { + s: "0" + } + } + attr { + key: "_use_off_line" + value { + s: "1" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\000\000 A\000\000 A" + } + } + } + } + node_def { + name: "Variable" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "_class" + value { + list { + s: "loc:@Variable/read" + } + } + } + attr { + key: "_var_format" + value { + s: "4D" + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 2 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } + } + node_def { + name: "Variable/read" + op: "Identity" + input: "Variable:ref:0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_var_format" + value { + s: "4D" + } + } + } + node_def { + name: "Add1" + op: "Add" + input: "Const_1:output:0" + input: "Variable/read:output:0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "Add1_0_retval" + value: "Add1:z:0" + } + } +} +versions { + producer: 134 +} diff --git a/tf_adapter/tests/ut/kernels/pbtxt/geop_output_error.pbtxt b/tf_adapter/tests/ut/kernels/pbtxt/geop_output_error.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..da7b36e42683dfdfbc0424f944229f6d8ba1f389 --- /dev/null +++ b/tf_adapter/tests/ut/kernels/pbtxt/geop_output_error.pbtxt @@ -0,0 +1,548 @@ +node { + name: "retval_Add_0_0" + op: "_Retval" + input: "GeOp51_0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "retval_Add_1_0" + op: "_Retval" + input: "GeOp51_0:1" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "GeOp51_0" + op: "GeOp" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "Tin" + value { + list { + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_NpuOptimizer" + value { + s: "NpuOptimizer" + } + } + attr { + key: "_auto_tune_mode" + value { + s: "" + } + } + attr { + key: "_buffer_optimize" + value { + s: "l2_optimize" + } + } + attr { + key: "_compress_weight_conf" + value { + s: "" + } + } + attr { + key: "_debug_dir" + value { + s: "" + } + } + attr { + key: "_distribute_config" + value { + s: "" + } + } + attr { + key: "_do_npu_optimizer" + value { + s: "1" + } + } + attr { + key: "_dump_debug_mode" + value { + s: "all" + } + } + attr { + key: "_dump_mode" + value { + s: "output" + } + } + attr { + key: "_dump_path" + value { + s: "./" + } + } + attr { + key: "_dump_step" + value { + s: "1" + } + } + attr { + key: "_dynamic_dims" + value { + s: "" + } + } + attr { + key: "_dynamic_graph_execute_mode" + value { + s: "lazy_recompile" + } + } + attr { + key: "_dynamic_input" + value { + s: "0" + } + } + attr { + key: "_dynamic_node_type" + value { + s: "" + } + } + attr { + key: "_enable_compress_weight" + value { + s: "0" + } + } + attr { + key: "_enable_data_pre_proc" + value { + s: "0" + } + } + attr { + key: "_enable_dump" + value { + s: "1" + } + } + attr { + key: "_enable_dump_debug" + value { + s: "1" + } + } + attr { + key: "_enable_exception_dump" + value { + s: "" + } + } + attr { + key: "_enable_scope_fusion_passes" + value { + s: "" + } + } + attr { + key: "_enable_small_channel" + value { + s: "0" + } + } + attr { + key: "_fusion_switch_file" + value { + s: "" + } + } + attr { + key: "_graph_run_mode" + value { + s: "1" + } + } + attr { + key: "_hcom_multi_mode" + value { + s: "" + } + } + attr { + key: "_hcom_parallel" + value { + s: "0" + } + } + attr { + key: "_in_out_pair" + value { + s: "" + } + } + attr { + key: "_in_out_pair_flag" + value { + s: "1" + } + } + attr { + key: "_input_shape" + value { + s: "" + } + } + attr { + key: "_is_tailing_optimization" + value { + s: "0" + } + } + attr { + key: "_iterations_per_loop" + value { + s: "1" + } + } + attr { + key: "_job" + value { + s: "localhost" + } + } + attr { + key: "_local_device_list" + value { + s: "" + } + } + attr { + key: "_local_rank_id" + value { + s: "-1" + } + } + attr { + key: "_lower_functional_ops" + value { + s: "0" + } + } + attr { + key: "_mix_compile_mode" + value { + s: "0" + } + } + attr { + key: "_mstune_mode" + value { + s: "" + } + } + attr { + key: "_op_compiler_cache_dir" + value { + s: "" + } + } + attr { + key: "_op_compiler_cache_mode" + value { + s: "" + } + } + attr { + key: "_op_debug_level" + value { + s: "0" + } + } + attr { + key: "_op_select_implmode" + value { + s: "" + } + } + attr { + key: "_op_tune_mode" + value { + s: "" + } + } + attr { + key: "_optypelist_for_implmode" + value { + s: "" + } + } + attr { + key: "_precision_mode" + value { + s: "" + } + } + attr { + key: "_profiling_mode" + value { + s: "0" + } + } + attr { + key: "_profiling_options" + value { + s: "" + } + } + attr { + key: "_session_device_id" + value { + s: "" + } + } + attr { + key: "_stream_max_parallel_num" + value { + s: "" + } + } + attr { + key: "_task_index" + value { + s: "0" + } + } + attr { + key: "_use_off_line" + value { + s: "1" + } + } + attr { + key: "_variable_format_optimize" + value { + s: "1" + } + } + attr { + key: "_work_path" + value { + s: "" + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } + attr { + key: "function" + value { + func { + name: "GeOp51_0" + } + } + } +} +library { + function { + signature { + name: "GeOp51_0" + output_arg { + name: "Add_0_retval" + type: DT_FLOAT + } + output_arg { + name: "Add_1_retval" + type: DT_FLOAT + } + } + node_def { + name: "Const_1" + op: "Const" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "_NpuOptimizer" + value { + s: "NpuOptimizer" + } + } + attr { + key: "_iterations_per_loop" + value { + s: "1" + } + } + attr { + key: "_job" + value { + s: "localhost" + } + } + attr { + key: "_mix_compile_mode" + value { + s: "0" + } + } + attr { + key: "_task_index" + value { + s: "0" + } + } + attr { + key: "_use_off_line" + value { + s: "1" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\000\000 A\000\000 A" + } + } + } + } + node_def { + name: "Variable" + op: "VariableV2" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "_class" + value { + list { + s: "loc:@Variable/read" + } + } + } + attr { + key: "_var_format" + value { + s: "4D" + } + } + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 2 + } + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } + } + node_def { + name: "Variable/read" + op: "Identity" + input: "Variable:ref:0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_var_format" + value { + s: "4D" + } + } + } + node_def { + name: "Add" + op: "Add" + input: "Const_1:output:0" + input: "Variable/read:output:0" + device: "/job:localhost/replica:0/task:0/device:CPU:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "Add_0_retval" + value: "Add:z:0" + } + ret { + key: "Add_1_retval" + value: "Add:z:0" + } + } +} +versions { + producer: 134 +} diff --git a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc index 727b71d3e088d746cc0697840c4e30e6841e077c..3d49a063cff73f82ba1450c6e24fe1f1cd294401 100644 --- a/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc +++ b/tf_adapter/tests/ut/kernels/testcase/geop_npu_test.cc @@ -88,11 +88,9 @@ Status GeOpRunGraphAsync(std::string example_path, gtl::InlinedVector(¶ms); AsyncOpKernel::DoneCallback done = []() { LOG(INFO) << "DONE DoneCallback"; }; async_op->ComputeAsync(ctx.get(), done); - EXPECT_EQ(ctx->status().ok(), true); if (!only_run_once) { auto ctx1 = absl::make_unique(¶ms); async_op->ComputeAsync(ctx1.get(), done); - EXPECT_EQ(ctx1->status().ok(), true); } } } @@ -105,6 +103,18 @@ TEST_F(GeOpTest, GeOpFuncTest) { gtl::InlinedVector inputs; EXPECT_TRUE(GeOpRunGraphAsync(graph_def_path, inputs, node_def, "GeOp1_0").ok()); } +TEST_F(GeOpTest, GeDynamicConfigError) { + NodeDef node_def; + std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop_dynamic_config.pbtxt"; + gtl::InlinedVector inputs; + EXPECT_TRUE(GeOpRunGraphAsync(graph_def_path, inputs, node_def, "GeOp61_0").ok()); +} +TEST_F(GeOpTest, GeOpOutputError) { + NodeDef node_def; + std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop_output_error.pbtxt"; + gtl::InlinedVector inputs; + EXPECT_TRUE(GeOpRunGraphAsync(graph_def_path, inputs, node_def, "GeOp51_0").ok()); +} TEST_F(GeOpTest, GeOpVarInitGraphTest) { NodeDef node_def; std::string graph_def_path = "tf_adapter/tests/ut/kernels/pbtxt/geop_var_init_graph.pbtxt"; diff --git a/tf_adapter/util/ge_plugin.cc b/tf_adapter/util/ge_plugin.cc index a369fbef13b69eafd564f8306bd2d392ac2b73d0..a74f3be574d4198780799ce4be9f692b85a1bcee 100644 --- a/tf_adapter/util/ge_plugin.cc +++ b/tf_adapter/util/ge_plugin.cc @@ -212,6 +212,8 @@ void GePlugin::Init(std::map &init_options, bool is_gl << ", work path : " << init_options["ge.tuningPath"] << ", distribute_config : " << init_options["distribute_config"]; + ADP_LOG(INFO) << "[GePlugin] fusion_switch_file :" << init_options["ge.fusionSwitchFile"]; + const char *tdt_uninit_env = std::getenv("ASCEND_TDT_UNINIT"); bool tdt_init = true; if (tdt_uninit_env != nullptr && std::atoi(tdt_uninit_env) == 1) { diff --git a/tf_adapter/util/npu_attrs.cc b/tf_adapter/util/npu_attrs.cc index 0e87e5d470507dc5bee45ec089e67fa0a142ef93..b1ac0b8625a07d820e8e7e79dbfcddae0449e836 100644 --- a/tf_adapter/util/npu_attrs.cc +++ b/tf_adapter/util/npu_attrs.cc @@ -317,6 +317,8 @@ std::map NpuAttrs::GetSessOptions(OpKernelConstruction std::string dynamic_node_type; std::string session_device_id; std::string modify_mixlist; + std::string op_precision_mode; + std::string graph_run_mode = "1"; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { ctx->GetAttr("_variable_format_optimize", &variable_format_optimize); @@ -367,6 +369,8 @@ std::map NpuAttrs::GetSessOptions(OpKernelConstruction ctx->GetAttr("_dynamic_node_type", &dynamic_node_type); ctx->GetAttr("_session_device_id", &session_device_id); ctx->GetAttr("_modify_mixlist", &modify_mixlist); + ctx->GetAttr("_op_precision_mode", &op_precision_mode); + ctx->GetAttr("_graph_run_mode", &graph_run_mode); } // session options @@ -396,6 +400,8 @@ std::map NpuAttrs::GetSessOptions(OpKernelConstruction sess_options["ge.session_device_id"] = session_device_id; } sess_options[ge::MODIFY_MIXLIST] = modify_mixlist; + sess_options["ge.exec.op_precision_mode"] = op_precision_mode; + sess_options[ge::OPTION_GRAPH_RUN_MODE] = graph_run_mode; return sess_options; } @@ -435,6 +441,7 @@ std::map NpuAttrs::GetInitOptions(OpKernelConstruction std::string work_path; std::string distribute_config; std::string modify_mixlist; + std::string fusion_switch_file; if (ctx != nullptr && ctx->GetAttr("_NpuOptimizer", &npuOptimizer) == Status::OK()) { ctx->GetAttr("_precision_mode", &precision_mode); @@ -453,6 +460,7 @@ std::map NpuAttrs::GetInitOptions(OpKernelConstruction ctx->GetAttr("_hcom_multi_mode", &hcom_multi_mode); ctx->GetAttr("_distribute_config", &distribute_config); ctx->GetAttr("_modify_mixlist", &modify_mixlist); + ctx->GetAttr("_fusion_switch_file", &fusion_switch_file); } @@ -476,6 +484,7 @@ std::map NpuAttrs::GetInitOptions(OpKernelConstruction init_options["ge.debugDir"] = debug_dir; init_options["ge.hcomMultiMode"] = hcom_multi_mode; init_options[ge::MODIFY_MIXLIST] = modify_mixlist; + init_options["ge.fusionSwitchFile"] = fusion_switch_file; return init_options; } @@ -768,6 +777,7 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) std::string hcom_multi_mode; std::string session_device_id; std::string modify_mixlist; + std::string op_precision_mode; if (attrs.Find("_NpuOptimizer") != nullptr) { do_npu_optimizer = std::to_string(true); @@ -904,6 +914,9 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) if (attrs.Find("_modify_mixlist") != nullptr) { modify_mixlist = attrs.Find("_modify_mixlist")->s(); } + if (attrs.Find("_op_precision_mode") != nullptr) { + op_precision_mode = attrs.Find("_op_precision_mode")->s(); + } } all_options["variable_format_optimize"] = variable_format_optimize; @@ -960,6 +973,7 @@ std::map NpuAttrs::GetAllAttrOptions(AttrSlice attrs) all_options["hcom_multi_mode"] = hcom_multi_mode; all_options["session_device_id"] = session_device_id; all_options["modify_mixlist"] = modify_mixlist; + all_options["op_precision_mode"] = op_precision_mode; return all_options; } @@ -1037,6 +1051,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options bool hcom_multi_mode = false; int session_device_id = -1; std::string modify_mixlist; + std::string op_precision_mode; const RewriterConfig &rewrite_options = options.session_options->config.graph_options().rewrite_options(); for (const auto &custom_optimizer : rewrite_options.custom_optimizers()) { @@ -1293,6 +1308,9 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options return errors::Internal("modify_mixlist is assigned, please ensure that precision_mode is assigned to 'allow_mix_precision'."); } } + if (params.count("op_precision_mode")) { + op_precision_mode = params.at("op_precision_mode").s(); + } } } @@ -1323,6 +1341,7 @@ Status NpuAttrs::SetNpuOptimizerAttr(const GraphOptimizationPassOptions &options sess_options["hcom_multi_mode"] = std::to_string(hcom_multi_mode); sess_options["session_device_id"] = std::to_string(session_device_id); sess_options["modify_mixlist"] = modify_mixlist; + sess_options["op_precision_mode"] = op_precision_mode; init_options["precision_mode"] = precision_mode; init_options["profiling_mode"] = std::to_string(profiling_mode); diff --git a/tf_adapter_2.x/cmake/tensorflow/module.cmake b/tf_adapter_2.x/cmake/tensorflow/module.cmake index 10e1350f30655cd484a58ca5355b7a7784a6a31b..7949da51866317f560ca4a69c373139f081a4c91 100644 --- a/tf_adapter_2.x/cmake/tensorflow/module.cmake +++ b/tf_adapter_2.x/cmake/tensorflow/module.cmake @@ -19,7 +19,7 @@ else() add_library(pywrap_tensorflow_internal SHARED ${fake_sources}) set_target_properties(pywrap_tensorflow_internal PROPERTIES PREFIX _) - SET(TF_INCLUDE_DIR ${ASCEND_CI_BUILD_DIR}/third_party/tensorflow/compile_deps/tf-2.4.0/include/org) + SET(TF_INCLUDE_DIR /opt/buildtools/tensorflow-2.4.1/tensorflow/include/) target_link_libraries(tensorflow_libs INTERFACE tensorflow_framework pywrap_tensorflow_internal) diff --git a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp index 2fe4f3a5d81868f3247ad6657c364cd5d1503d21..c4441d04ddc2be3b3d40e526ee209333664086c1 100644 --- a/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp +++ b/tf_adapter_2.x/npu_device/core/npu_wrapper.cpp @@ -72,6 +72,7 @@ const std::map kConfigurableOptions = { {"is_tailing_optimization", ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION}, {"op_debug_level", ge::OP_DEBUG_LEVEL}, {"debug_dir", ge::DEBUG_DIR}, + {"modify_mixlist", ge::MODIFY_MIXLIST}, {"enable_exception_dump", ge::OPTION_EXEC_ENABLE_EXCEPTION_DUMP}, {"enable_dump", ge::OPTION_EXEC_ENABLE_DUMP}, {"dump_path", ge::OPTION_EXEC_DUMP_PATH}, diff --git a/tf_adapter_2.x/python/npu_device/__init__.py b/tf_adapter_2.x/python/npu_device/__init__.py index 1eaf9267f21caf1ca274e443ec682c3b4f336e5f..7478065407394d6b18e4ac9eb0e22cb27ebda377 100644 --- a/tf_adapter_2.x/python/npu_device/__init__.py +++ b/tf_adapter_2.x/python/npu_device/__init__.py @@ -3,6 +3,8 @@ from npu_device.npu_device import never_nested_function from npu_device.npu_device import gen_npu_ops from npu_device.npu_device import global_options +from npu_device.utils.scope import keep_dtype_scope + from npu_device._api import distribute from npu_device._api import train from npu_device._api import ops diff --git a/tf_adapter_2.x/python/npu_device/configs/npu_config.py b/tf_adapter_2.x/python/npu_device/configs/npu_config.py index 35e286fe0e045106ba23f658210f6ad3d170b8f7..1e2b0c54dedabe32bb79145f1c95224d800809eb 100644 --- a/tf_adapter_2.x/python/npu_device/configs/npu_config.py +++ b/tf_adapter_2.x/python/npu_device/configs/npu_config.py @@ -27,6 +27,7 @@ class NpuConfig(NpuBaseConfig): self.is_tailing_optimization = OptionValue(False, [True, False]) self.op_debug_level = OptionValue(0, [0, 1, 2, 3]) self.debug_dir = OptionValue(None, None) + self.modify_mixlist = OptionValue(None, None) self.enable_exception_dump = OptionValue(0, [0, 1]) self.dump_config = NpuDumpConfig() self.profiling_config = NpuProfilingConfig() diff --git a/tf_adapter_2.x/python/npu_device/distribute/hccl.py b/tf_adapter_2.x/python/npu_device/distribute/hccl.py index 4b32864f98f7323766c86b6e8c9ff542471530bf..63f44b0ca423e17fb7cabb7027774e5ffa0a1109 100644 --- a/tf_adapter_2.x/python/npu_device/distribute/hccl.py +++ b/tf_adapter_2.x/python/npu_device/distribute/hccl.py @@ -46,7 +46,7 @@ def _all_reduce(values, reduction, fusion, fusion_id, group): return reduced_values -def all_reduce(values, reduction, fusion=1, fusion_id=-1, group="hccl_world_group"): +def all_reduce(values, reduction="mean", fusion=1, fusion_id=-1, group="hccl_world_group"): if global_npu_ctx() is None or not global_npu_ctx().is_cluster_worker(): logging.info("Skip all reduce as current process is not npu cluster worker") return values @@ -63,7 +63,7 @@ def _broadcast(values, root_rank, fusion, fusion_id, group): value.assign(hccl_ops.broadcast([value], root_rank, fusion, fusion_id, group)[0]) -def broadcast(values, root_rank, fusion=2, fusion_id=0, group="hccl_world_group"): +def broadcast(values, root_rank=0, fusion=2, fusion_id=0, group="hccl_world_group"): if global_npu_ctx() is None or not global_npu_ctx().is_cluster_worker(): logging.info("Skip broadcast as current process is not npu cluster worker") return diff --git a/tf_adapter_2.x/python/npu_device/npu_device.py b/tf_adapter_2.x/python/npu_device/npu_device.py index 48724abc07d03471126f31832fc340faa9522a5d..0c3f3a4be07bd7ef8d73392dab52e272084aac0f 100644 --- a/tf_adapter_2.x/python/npu_device/npu_device.py +++ b/tf_adapter_2.x/python/npu_device/npu_device.py @@ -137,7 +137,11 @@ def never_nested_function(func=None, *args, **kwargs): def never_nested_decorator(f): if kwargs.get('experimental_compile'): logging.info("Skip xla compile tf function %s on npu", f.__name__) - kwargs['experimental_compile'] = False + kwargs['experimental_compile'] = False + if kwargs.get('jit_compile'): + logging.info("Skip xla compile tf function %s on npu", f.__name__) + kwargs['jit_compile'] = False + tf_decorated_func = _hacked_tensorflow_function(*args, **kwargs)(f) def wrapper(*func_args, **func_kwargs): diff --git a/tf_adapter_2.x/python/npu_device/utils/scope.py b/tf_adapter_2.x/python/npu_device/utils/scope.py new file mode 100644 index 0000000000000000000000000000000000000000..c175bb173ad5dea1a4ff1af22d1ff5dd79ae40dd --- /dev/null +++ b/tf_adapter_2.x/python/npu_device/utils/scope.py @@ -0,0 +1,9 @@ +from tensorflow.python.framework import ops +from tensorflow.python.util import tf_contextlib +from tensorflow.core.framework import attr_value_pb2 + + +@tf_contextlib.contextmanager +def keep_dtype_scope(): + with ops.get_default_graph()._attr_scope({'_keep_dtype': attr_value_pb2.AttrValue(b=True)}): + yield